diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt index b1ca70e1..e53885eb 100644 --- a/crypto/CMakeLists.txt +++ b/crypto/CMakeLists.txt @@ -105,6 +105,7 @@ if(${ARCH} STREQUAL "arm") chacha/chacha-armv4.${ASM_EXT} curve25519/asm/x25519-asm-arm.S poly1305/poly1305_arm_asm.S + hrss/asm/poly_mul_vec_armv7_neon.S ) endif() @@ -131,6 +132,7 @@ if(${ARCH} STREQUAL "x86_64") chacha/chacha-x86_64.${ASM_EXT} cipher_extra/aes128gcmsiv-x86_64.${ASM_EXT} cipher_extra/chacha20_poly1305_x86_64.${ASM_EXT} + hrss/asm/poly_rq_mul.S ) endif() @@ -275,6 +277,7 @@ add_library( evp/sign.c ex_data.c hkdf/hkdf.c + hrss/hrss.c lhash/lhash.c mem.c obj/obj.c @@ -455,6 +458,7 @@ add_executable( fipsmodule/rand/ctrdrbg_test.cc hkdf/hkdf_test.cc hmac_extra/hmac_test.cc + hrss/hrss_test.cc lhash/lhash_test.cc obj/obj_test.cc pem/pem_test.cc diff --git a/crypto/hrss/asm/poly_mul_vec_armv7_neon.S b/crypto/hrss/asm/poly_mul_vec_armv7_neon.S new file mode 100644 index 00000000..93d491c2 --- /dev/null +++ b/crypto/hrss/asm/poly_mul_vec_armv7_neon.S @@ -0,0 +1,4260 @@ +// Copyright (c) 2018, Google Inc. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +// This file is produced by compiling hrss.c with Clang and -mfpu=neon, and +// then trimming the output to just include the vectorised functions. 
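+//
+// Regenerating it means recompiling hrss.c to assembly and extracting those
+// functions again; the exact invocation is not recorded here, but it is
+// along the lines of (illustrative only, flags are assumptions):
+//
+//   clang --target=armv7a-linux-gnueabihf -mfpu=neon -O2 -g -S hrss.c
+//
+// The vectorised C that this captures works on 128-bit vectors (uint16x8_t
+// on NEON); as a rough sketch (not the exact hrss.c source), a lane-wise
+// constant-time select looks like:
+//
+//   static inline uint16x8_t vec_select(uint16x8_t mask, uint16x8_t a,
+//                                       uint16x8_t b) {
+//     // Each mask lane is all-ones or all-zeros: pick a where set, else b.
+//     return veorq_u16(b, vandq_u16(mask, veorq_u16(a, b)));
+//   }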
+ +#if !defined(OPENSSL_NO_ASM) && !defined(__ARM_NEON__) + + .text + .syntax unified + .eabi_attribute 67, "2.09" @ Tag_conformance + .eabi_attribute 6, 10 @ Tag_CPU_arch + .eabi_attribute 7, 65 @ Tag_CPU_arch_profile + .eabi_attribute 8, 1 @ Tag_ARM_ISA_use + .eabi_attribute 9, 2 @ Tag_THUMB_ISA_use + .fpu neon + .eabi_attribute 34, 1 @ Tag_CPU_unaligned_access + .eabi_attribute 15, 1 @ Tag_ABI_PCS_RW_data + .eabi_attribute 16, 1 @ Tag_ABI_PCS_RO_data + .eabi_attribute 17, 2 @ Tag_ABI_PCS_GOT_use + .eabi_attribute 20, 1 @ Tag_ABI_FP_denormal + .eabi_attribute 21, 1 @ Tag_ABI_FP_exceptions + .eabi_attribute 23, 3 @ Tag_ABI_FP_number_model + .eabi_attribute 24, 1 @ Tag_ABI_align_needed + .eabi_attribute 25, 1 @ Tag_ABI_align_preserved + .eabi_attribute 38, 1 @ Tag_ABI_FP_16bit_format + .eabi_attribute 18, 4 @ Tag_ABI_PCS_wchar_t + .eabi_attribute 26, 2 @ Tag_ABI_enum_size + .eabi_attribute 14, 0 @ Tag_ABI_PCS_R9_use + .file "hrss.c" + + .section .text.poly3_invert_vec,"ax",%progbits + .hidden poly3_invert_vec @ -- Begin function poly3_invert_vec + .globl poly3_invert_vec + .p2align 4 + .type poly3_invert_vec,%function + .code 16 @ @poly3_invert_vec + .thumb_func +poly3_invert_vec: +.Lfunc_begin0: + .file 1 "../crypto/hrss/hrss.c" + .loc 1 718 0 @ ../crypto/hrss/hrss.c:718:0 + .fnstart + .cfi_sections .debug_frame + .cfi_startproc +@ %bb.0: + .save {r4, r5, r6, r7, lr} + push {r4, r5, r6, r7, lr} + .cfi_def_cfa_offset 20 + .cfi_offset lr, -4 + .cfi_offset r7, -8 + .cfi_offset r6, -12 + .cfi_offset r5, -16 + .cfi_offset r4, -20 + .setfp r7, sp, #12 + add r7, sp, #12 + .cfi_def_cfa r7, 8 + .save {r8, r9, r10} + push.w {r8, r9, r10} + .cfi_offset r10, -24 + .cfi_offset r9, -28 + .cfi_offset r8, -32 + .vsave {d8, d9, d10, d11, d12, d13, d14, d15} + vpush {d8, d9, d10, d11, d12, d13, d14, d15} + .cfi_offset d15, -40 + .cfi_offset d14, -48 + .cfi_offset d13, -56 + .cfi_offset d12, -64 + .cfi_offset d11, -72 + .cfi_offset d10, -80 + .cfi_offset d9, -88 + .cfi_offset d8, -96 + .pad #944 + sub.w sp, sp, #944 + mov r4, sp + bfc r4, #0, #4 + mov sp, r4 + mov r10, r0 +.Ltmp0: + .loc 1 735 3 prologue_end @ ../crypto/hrss/hrss.c:735:3 + movs r0, #104 + .loc 1 733 3 @ ../crypto/hrss/hrss.c:733:3 + mov r2, r1 + add.w lr, sp, #704 + vld1.16 {d4, d5}, [r2], r0 + adr r0, .LCPI0_2 + vmov.i8 q14, #0xff + mov.w r5, #700 + vld1.64 {d16, d17}, [r0:128] + adr r0, .LCPI0_3 + vmov.i32 q1, #0x0 + mvn r12, #-2147483648 + vst1.64 {d16, d17}, [lr:128] @ 16-byte Spill + .loc 1 735 3 @ ../crypto/hrss/hrss.c:735:3 + add.w lr, sp, #672 + vmov.i32 q11, #0x0 + mov.w r6, #700 + vld1.64 {d16, d17}, [r0:128] + add.w r0, r1, #152 + vmov.i32 q12, #0x0 + vst1.64 {d16, d17}, [lr:128] @ 16-byte Spill + add.w lr, sp, #384 + vld1.32 {d16, d17}, [r2] + .loc 1 733 3 @ ../crypto/hrss/hrss.c:733:3 + add.w r2, r1, #64 + .loc 1 735 3 @ ../crypto/hrss/hrss.c:735:3 + vst1.64 {d16, d17}, [lr:128] @ 16-byte Spill + add.w lr, sp, #640 + vld1.32 {d16, d17}, [r0] + add.w r0, r1, #136 + vst1.64 {d16, d17}, [lr:128] @ 16-byte Spill + add.w lr, sp, #496 + vld1.32 {d16, d17}, [r0] + add.w r0, r1, #120 + vst1.64 {d16, d17}, [lr:128] @ 16-byte Spill + add.w lr, sp, #432 + vld1.32 {d16, d17}, [r0] + add.w r0, r1, #88 + vst1.64 {d16, d17}, [lr:128] @ 16-byte Spill + vmov.i32 d17, #0x0 + .loc 1 733 3 @ ../crypto/hrss/hrss.c:733:3 + add.w lr, sp, #544 + vld1.32 {d20, d21}, [r2] + add.w r2, r1, #32 + .loc 1 735 3 @ ../crypto/hrss/hrss.c:735:3 + vld1.32 {d30, d31}, [r0] + .loc 1 733 3 @ ../crypto/hrss/hrss.c:733:3 + add.w r0, r1, #16 + vldr d18, [r1, #80] + 
.loc 1 735 3 @ ../crypto/hrss/hrss.c:735:3 + vldr d16, [r1, #168] + .loc 1 733 3 @ ../crypto/hrss/hrss.c:733:3 + adds r1, #48 + vst1.64 {d20, d21}, [lr:128] @ 16-byte Spill + add.w lr, sp, #416 + vorr d19, d17, d17 + vld1.32 {d20, d21}, [r1] + movs r1, #0 + vst1.64 {d20, d21}, [lr:128] @ 16-byte Spill + add.w lr, sp, #400 + vld1.32 {d20, d21}, [r2] + movw r2, #1399 + vst1.64 {d20, d21}, [lr:128] @ 16-byte Spill + add.w lr, sp, #352 + vld1.32 {d20, d21}, [r0] + add r0, sp, #880 + vst1.64 {d20, d21}, [lr:128] @ 16-byte Spill + add.w lr, sp, #656 + vmov.i8 q10, #0xff + vst1.64 {d16, d17}, [lr:128] @ 16-byte Spill + vmov.i16 q8, #0xf + add.w lr, sp, #624 + vneg.s16 q8, q8 + vst1.64 {d18, d19}, [lr:128] @ 16-byte Spill + add.w lr, sp, #608 + vmov.i8 q9, #0xff + vst1.64 {d16, d17}, [lr:128] @ 16-byte Spill + vmov.i32 q8, #0x0 + mov.w lr, #0 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vmov.i32 q8, #0x0 + add r0, sp, #896 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #592 + vmov.i8 q8, #0xff + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i8 q9, #0xff + add r0, sp, #576 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i8 q9, #0xff + add r0, sp, #560 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #528 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #512 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #480 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #464 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #448 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #208 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #224 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #320 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #288 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #256 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #368 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #336 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #304 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #272 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #240 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #800 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #816 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #832 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #848 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #864 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #688 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #720 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #736 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #752 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + add r0, sp, #784 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmov.i32 q9, #0x0 + .loc 1 747 3 @ ../crypto/hrss/hrss.c:747:3 + add r0, sp, #768 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + b .LBB0_3 + .p2align 4 +@ %bb.1: 
+ .loc 1 0 3 is_stmt 0 @ ../crypto/hrss/hrss.c:0:3 +.LCPI0_2: + .short 1 @ 0x1 + .short 0 @ 0x0 + .short 0 @ 0x0 + .short 0 @ 0x0 + .short 0 @ 0x0 + .short 0 @ 0x0 + .short 0 @ 0x0 + .short 0 @ 0x0 + .p2align 4 +@ %bb.2: +.LCPI0_3: + .short 65535 @ 0xffff + .short 65535 @ 0xffff + .short 65535 @ 0xffff + .short 8191 @ 0x1fff + .short 0 @ 0x0 + .short 0 @ 0x0 + .short 0 @ 0x0 + .short 0 @ 0x0 + .p2align 1 +.LBB0_3: @ =>This Inner Loop Header: Depth=1 + .loc 1 749 32 is_stmt 1 @ ../crypto/hrss/hrss.c:749:32 + add r0, sp, #96 + vand q9, q1, q15 + .loc 1 751 32 @ ../crypto/hrss/hrss.c:751:32 + vand q13, q8, q15 +.Ltmp1: + .file 2 "../crypto/hrss/../internal.h" + .loc 2 270 42 @ ../crypto/hrss/../internal.h:270:42 + subs r4, r5, r6 +.Ltmp2: + .loc 1 749 32 @ ../crypto/hrss/hrss.c:749:32 + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill + add r0, sp, #128 + .loc 1 749 52 is_stmt 0 @ ../crypto/hrss/hrss.c:749:52 + vand q11, q8, q2 +.Ltmp3: + .loc 2 270 35 is_stmt 1 @ ../crypto/hrss/../internal.h:270:35 + eor.w r3, r5, r6 +.Ltmp4: + .loc 1 749 32 @ ../crypto/hrss/hrss.c:749:32 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill + .loc 1 751 52 @ ../crypto/hrss/hrss.c:751:52 + vand q12, q1, q2 + .loc 1 749 42 @ ../crypto/hrss/hrss.c:749:42 + veor q9, q11, q9 +.Ltmp5: + .loc 2 270 45 @ ../crypto/hrss/../internal.h:270:45 + eors r4, r5 +.Ltmp6: + .loc 1 751 42 @ ../crypto/hrss/hrss.c:751:42 + veor q11, q13, q12 +.Ltmp7: + .loc 2 270 38 @ ../crypto/hrss/../internal.h:270:38 + orrs r4, r3 +.Ltmp8: + .loc 1 749 21 @ ../crypto/hrss/hrss.c:749:21 + vand q12, q14, q9 +.Ltmp9: + .loc 2 270 31 @ ../crypto/hrss/../internal.h:270:31 + eors r4, r5 +.Ltmp10: + .loc 1 751 21 @ ../crypto/hrss/hrss.c:751:21 + vand q9, q14, q11 + .loc 1 749 32 @ ../crypto/hrss/hrss.c:749:32 + add r0, sp, #912 +.Ltmp11: + .loc 2 234 13 @ ../crypto/hrss/../internal.h:234:13 + asrs r4, r4, #31 +.Ltmp12: + .loc 1 747 26 @ ../crypto/hrss/hrss.c:747:26 + subs r2, #1 +.Ltmp13: + .loc 1 185 7 @ ../crypto/hrss/hrss.c:185:7 + vorr q11, q9, q12 +.Ltmp14: + .loc 1 153 50 @ ../crypto/hrss/hrss.c:153:50 + vmov.16 d26[0], r4 +.Ltmp15: + .loc 1 185 7 @ ../crypto/hrss/hrss.c:185:7 + vshl.i16 q9, q9, #15 +.Ltmp16: + .loc 1 749 32 @ ../crypto/hrss/hrss.c:749:32 + vst1.64 {d28, d29}, [r0:128] @ 16-byte Spill +.Ltmp17: + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + add r0, sp, #192 +.Ltmp18: + .loc 1 185 7 @ ../crypto/hrss/hrss.c:185:7 + vshl.i16 q11, q11, #15 +.Ltmp19: + .loc 1 753 14 @ ../crypto/hrss/hrss.c:753:14 + vshr.s16 q11, q11, #15 + .loc 1 753 21 is_stmt 0 @ ../crypto/hrss/hrss.c:753:21 + vand q11, q13, q11 +.Ltmp20: + .loc 1 689 40 is_stmt 1 @ ../crypto/hrss/hrss.c:689:40 + veor q13, q8, q15 +.Ltmp21: + .loc 1 753 21 @ ../crypto/hrss/hrss.c:753:21 + vdup.16 q0, d22[0] +.Ltmp22: + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + veor q11, q1, q2 + .loc 1 689 30 @ ../crypto/hrss/hrss.c:689:30 + vand q13, q0, q13 + .loc 1 685 30 @ ../crypto/hrss/hrss.c:685:30 + vand q14, q0, q11 + .loc 1 691 12 @ ../crypto/hrss/hrss.c:691:12 + veor q8, q13, q8 + .loc 1 687 12 @ ../crypto/hrss/hrss.c:687:12 + veor q1, q14, q1 +.Ltmp23: + .loc 1 185 7 @ ../crypto/hrss/hrss.c:185:7 + vshl.i16 q11, q12, #15 +.Ltmp24: + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill +.Ltmp25: + .loc 1 185 7 @ ../crypto/hrss/hrss.c:185:7 + add r0, sp, #160 + vst1.64 {d2, d3}, [r0:128] @ 16-byte Spill + add r0, sp, #608 + vld1.64 {d6, d7}, [r0:128] @ 16-byte Reload +.Ltmp26: + .loc 1 708 41 @ ../crypto/hrss/hrss.c:708:41 + add r0, sp, #144 +.Ltmp27: 
+ .loc 1 185 7 @ ../crypto/hrss/hrss.c:185:7 + vshl.s16 q9, q9, q3 +.Ltmp28: + .loc 1 185 7 is_stmt 0 @ ../crypto/hrss/hrss.c:185:7 + vshl.s16 q11, q11, q3 +.Ltmp29: + .loc 1 186 10 is_stmt 1 @ ../crypto/hrss/hrss.c:186:10 + vdup.16 q9, d18[0] +.Ltmp30: + .loc 1 186 10 is_stmt 0 @ ../crypto/hrss/hrss.c:186:10 + vdup.16 q3, d22[0] +.Ltmp31: + .loc 1 701 44 is_stmt 1 @ ../crypto/hrss/hrss.c:701:44 + vand q12, q8, q9 + .loc 1 701 32 is_stmt 0 @ ../crypto/hrss/hrss.c:701:32 + vand q11, q1, q3 + .loc 1 701 38 @ ../crypto/hrss/hrss.c:701:38 + veor q5, q12, q11 + .loc 1 702 33 is_stmt 1 @ ../crypto/hrss/hrss.c:702:33 + vand q12, q1, q9 + .loc 1 702 44 is_stmt 0 @ ../crypto/hrss/hrss.c:702:44 + vand q1, q8, q3 + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q4, q1, q12 +.Ltmp32: + .loc 1 686 12 is_stmt 1 @ ../crypto/hrss/hrss.c:686:12 + veor q1, q14, q2 + .loc 1 690 12 @ ../crypto/hrss/hrss.c:690:12 + veor q14, q13, q15 +.Ltmp33: + .loc 1 708 22 @ ../crypto/hrss/hrss.c:708:22 + vand q2, q5, q1 + .loc 1 705 31 @ ../crypto/hrss/hrss.c:705:31 + vorr q8, q14, q1 + .loc 1 708 41 @ ../crypto/hrss/hrss.c:708:41 + vbic q13, q4, q8 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + .loc 1 706 34 @ ../crypto/hrss/hrss.c:706:34 + vorr q8, q5, q4 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + add r0, sp, #80 + .loc 1 708 35 is_stmt 0 @ ../crypto/hrss/hrss.c:708:35 + veor q13, q13, q2 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + vbic q2, q14, q8 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #384 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q6, q13, q2 + vld1.64 {d24, d25}, [r0:128] @ 16-byte Reload +.Ltmp34: + .loc 1 689 30 is_stmt 1 @ ../crypto/hrss/hrss.c:689:30 + add r0, sp, #928 + .loc 1 689 40 is_stmt 0 @ ../crypto/hrss/hrss.c:689:40 + veor q13, q10, q12 + .loc 1 689 30 @ ../crypto/hrss/hrss.c:689:30 + vst1.64 {d0, d1}, [r0:128] @ 16-byte Spill + .loc 1 685 40 is_stmt 1 @ ../crypto/hrss/hrss.c:685:40 + add r0, sp, #176 + .loc 1 689 30 @ ../crypto/hrss/hrss.c:689:30 + vand q13, q0, q13 +.Ltmp35: + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q14, q4, q14 +.Ltmp36: + .loc 1 691 12 @ ../crypto/hrss/hrss.c:691:12 + veor q10, q13, q10 +.Ltmp37: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + vand q15, q10, q9 +.Ltmp38: + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill + add r0, sp, #528 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #352 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp39: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + add r0, sp, #16 +.Ltmp40: + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + veor q2, q8, q11 +.Ltmp41: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + add r0, sp, #528 +.Ltmp42: + .loc 1 685 30 @ ../crypto/hrss/hrss.c:685:30 + vand q2, q0, q2 + .loc 1 687 12 @ ../crypto/hrss/hrss.c:687:12 + veor q8, q2, q8 +.Ltmp43: + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + vand q7, q8, q3 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill +.Ltmp44: + .loc 1 223 33 @ ../crypto/hrss/hrss.c:223:33 + add r0, sp, #32 +.Ltmp45: + .loc 1 701 38 @ ../crypto/hrss/hrss.c:701:38 + veor q15, q15, q7 + .loc 1 702 33 @ ../crypto/hrss/hrss.c:702:33 + vand q7, q8, q9 + .loc 1 702 44 is_stmt 0 @ ../crypto/hrss/hrss.c:702:44 + vand q8, q10, q3 +.Ltmp46: + .loc 1 690 12 is_stmt 1 @ ../crypto/hrss/hrss.c:690:12 + veor q10, q13, q12 +.Ltmp47: + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q9, q8, q7 +.Ltmp48: + .loc 1 686 12 @ 
../crypto/hrss/hrss.c:686:12 + veor q7, q2, q11 +.Ltmp49: + .loc 1 706 34 @ ../crypto/hrss/hrss.c:706:34 + vorr q12, q15, q9 + .loc 1 705 31 @ ../crypto/hrss/hrss.c:705:31 + vorr q0, q10, q7 + .loc 1 708 22 @ ../crypto/hrss/hrss.c:708:22 + vand q2, q15, q7 + .loc 1 708 41 is_stmt 0 @ ../crypto/hrss/hrss.c:708:41 + vbic q13, q9, q0 +.Ltmp50: + .loc 1 224 12 is_stmt 1 @ ../crypto/hrss/hrss.c:224:12 + vshr.u16 q11, q6, #1 +.Ltmp51: + .loc 1 708 35 @ ../crypto/hrss/hrss.c:708:35 + veor q13, q13, q2 + .loc 1 708 60 is_stmt 0 @ ../crypto/hrss/hrss.c:708:60 + vbic q2, q10, q12 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q8, q13, q2 + vmov.i32 q2, #0x0 +.Ltmp52: + .loc 1 223 33 is_stmt 1 @ ../crypto/hrss/hrss.c:223:33 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vshl.i16 q8, q8, #15 + .loc 1 224 12 @ ../crypto/hrss/hrss.c:224:12 + add r0, sp, #384 + .loc 1 227 15 @ ../crypto/hrss/hrss.c:227:15 + vext.16 q13, q2, q8, #1 + .loc 1 225 12 @ ../crypto/hrss/hrss.c:225:12 + vorr q11, q13, q11 + .loc 1 224 12 @ ../crypto/hrss/hrss.c:224:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + .loc 1 223 33 @ ../crypto/hrss/hrss.c:223:33 + vshl.i16 q13, q6, #15 +.Ltmp53: + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + add r0, sp, #80 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + .loc 1 707 41 is_stmt 0 @ ../crypto/hrss/hrss.c:707:41 + add r0, sp, #144 +.Ltmp54: + .loc 1 225 15 is_stmt 1 @ ../crypto/hrss/hrss.c:225:15 + vext.16 q13, q13, q2, #1 + .loc 1 226 12 @ ../crypto/hrss/hrss.c:226:12 + vorr q6, q11, q13 +.Ltmp55: + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vbic q11, q1, q8 + .loc 1 707 41 is_stmt 0 @ ../crypto/hrss/hrss.c:707:41 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + vbic q13, q5, q8 +.Ltmp56: + .loc 1 218 12 is_stmt 1 @ ../crypto/hrss/hrss.c:218:12 + add r0, sp, #352 +.Ltmp57: + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q8, q9, q10 + .loc 1 707 35 is_stmt 0 @ ../crypto/hrss/hrss.c:707:35 + veor q13, q13, q14 + .loc 1 707 54 @ ../crypto/hrss/hrss.c:707:54 + veor q11, q13, q11 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vbic q13, q7, q12 + .loc 1 707 41 @ ../crypto/hrss/hrss.c:707:41 + vbic q12, q15, q0 + .loc 1 707 35 @ ../crypto/hrss/hrss.c:707:35 + veor q8, q12, q8 + .loc 1 707 54 @ ../crypto/hrss/hrss.c:707:54 + veor q1, q8, q13 +.Ltmp58: + .loc 1 218 12 is_stmt 1 @ ../crypto/hrss/hrss.c:218:12 + vshr.u16 q8, q11, #1 + .loc 1 217 33 @ ../crypto/hrss/hrss.c:217:33 + vshl.i16 q9, q1, #15 + .loc 1 218 12 @ ../crypto/hrss/hrss.c:218:12 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + add r0, sp, #80 + .loc 1 221 15 @ ../crypto/hrss/hrss.c:221:15 + vext.16 q9, q2, q9, #1 + .loc 1 219 12 @ ../crypto/hrss/hrss.c:219:12 + vorr q8, q9, q8 + .loc 1 217 33 @ ../crypto/hrss/hrss.c:217:33 + vshl.i16 q9, q11, #15 + .loc 1 219 15 @ ../crypto/hrss/hrss.c:219:15 + vext.16 q9, q9, q2, #1 + .loc 1 220 12 @ ../crypto/hrss/hrss.c:220:12 + vorr q12, q8, q9 +.Ltmp59: + .loc 1 772 56 @ ../crypto/hrss/hrss.c:772:56 + vorr q8, q12, q6 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill + add r0, sp, #112 + vst1.64 {d12, d13}, [r0:128] @ 16-byte Spill +.Ltmp60: + .loc 1 185 7 @ ../crypto/hrss/hrss.c:185:7 + add r0, sp, #608 + vshl.i16 q8, q8, #15 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #144 + vshl.s16 q8, q8, q9 + .loc 1 186 10 @ ../crypto/hrss/hrss.c:186:10 + vdup.16 q11, d16[0] + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill + add r0, sp, #896 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp61: + .loc 1 777 40 @ ../crypto/hrss/hrss.c:777:40 + add r0, sp, 
#912 + .loc 1 779 65 @ ../crypto/hrss/hrss.c:779:65 + veor q8, q6, q10 + .loc 1 777 40 @ ../crypto/hrss/hrss.c:777:40 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #896 + vand q9, q11, q9 + .loc 1 779 55 @ ../crypto/hrss/hrss.c:779:55 + vand q8, q9, q8 + .loc 1 780 9 @ ../crypto/hrss/hrss.c:780:9 + veor q10, q8, q10 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill + add r0, sp, #880 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload + add r0, sp, #880 + .loc 1 777 65 @ ../crypto/hrss/hrss.c:777:65 + veor q8, q12, q10 + .loc 1 777 55 is_stmt 0 @ ../crypto/hrss/hrss.c:777:55 + vand q8, q9, q8 + .loc 1 778 9 is_stmt 1 @ ../crypto/hrss/hrss.c:778:9 + veor q10, q8, q10 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill + add r0, sp, #672 + vld1.64 {d24, d25}, [r0:128] @ 16-byte Reload + add r0, sp, #656 + vld1.64 {d26, d27}, [r0:128] @ 16-byte Reload + add r0, sp, #928 +.Ltmp62: + .loc 1 689 40 @ ../crypto/hrss/hrss.c:689:40 + veor q8, q12, q13 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload + add r0, sp, #624 + .loc 1 689 30 is_stmt 0 @ ../crypto/hrss/hrss.c:689:30 + vand q8, q14, q8 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload + add r0, sp, #448 + .loc 1 691 12 is_stmt 1 @ ../crypto/hrss/hrss.c:691:12 + veor q12, q8, q12 + vld1.64 {d8, d9}, [r0:128] @ 16-byte Reload + add r0, sp, #16 + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + veor q9, q4, q15 + vld1.64 {d0, d1}, [r0:128] @ 16-byte Reload + add r0, sp, #448 + .loc 1 685 30 is_stmt 0 @ ../crypto/hrss/hrss.c:685:30 + vand q9, q14, q9 +.Ltmp63: + .loc 1 701 44 is_stmt 1 @ ../crypto/hrss/hrss.c:701:44 + vand q11, q12, q0 +.Ltmp64: + .loc 1 687 12 @ ../crypto/hrss/hrss.c:687:12 + veor q4, q9, q4 + .loc 1 686 12 @ ../crypto/hrss/hrss.c:686:12 + veor q9, q9, q15 +.Ltmp65: + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + vand q10, q4, q3 + vst1.64 {d8, d9}, [r0:128] @ 16-byte Spill + add r0, sp, #672 + .loc 1 701 38 is_stmt 0 @ ../crypto/hrss/hrss.c:701:38 + veor q10, q11, q10 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill +.Ltmp66: + .loc 1 690 12 is_stmt 1 @ ../crypto/hrss/hrss.c:690:12 + veor q8, q8, q13 +.Ltmp67: + .loc 1 702 33 @ ../crypto/hrss/hrss.c:702:33 + vand q11, q4, q0 +.Ltmp68: + .loc 1 225 15 @ ../crypto/hrss/hrss.c:225:15 + add r0, sp, #64 +.Ltmp69: + .loc 1 702 44 @ ../crypto/hrss/hrss.c:702:44 + vand q12, q12, q3 + .loc 1 705 31 @ ../crypto/hrss/hrss.c:705:31 + vorr q13, q8, q9 + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q11, q12, q11 + .loc 1 708 22 @ ../crypto/hrss/hrss.c:708:22 + vand q15, q10, q9 + .loc 1 708 41 is_stmt 0 @ ../crypto/hrss/hrss.c:708:41 + vbic q12, q11, q13 + .loc 1 708 35 @ ../crypto/hrss/hrss.c:708:35 + veor q12, q12, q15 + .loc 1 706 34 is_stmt 1 @ ../crypto/hrss/hrss.c:706:34 + vorr q15, q10, q11 + .loc 1 707 41 @ ../crypto/hrss/hrss.c:707:41 + vbic q10, q10, q13 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + vbic q4, q8, q15 + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q8, q11, q8 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q12, q12, q4 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vbic q9, q9, q15 + .loc 1 707 35 is_stmt 0 @ ../crypto/hrss/hrss.c:707:35 + veor q8, q10, q8 +.Ltmp70: + .loc 1 224 12 is_stmt 1 @ ../crypto/hrss/hrss.c:224:12 + vshr.u16 q4, q12, #1 + .loc 1 223 33 @ ../crypto/hrss/hrss.c:223:33 + vshl.i16 q12, q12, #15 +.Ltmp71: + .loc 1 707 54 @ ../crypto/hrss/hrss.c:707:54 + veor q8, q8, q9 + vmov.i32 q11, #0x0 +.Ltmp72: + .loc 1 225 15 @ ../crypto/hrss/hrss.c:225:15 + vext.16 q5, q12, q2, #1 + .loc 1 218 12 @ ../crypto/hrss/hrss.c:218:12 + 
vshr.u16 q9, q8, #1 + .loc 1 225 15 @ ../crypto/hrss/hrss.c:225:15 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill + .loc 1 226 12 @ ../crypto/hrss/hrss.c:226:12 + vorr q12, q5, q4 +.Ltmp73: + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + add r0, sp, #656 +.Ltmp74: + .loc 1 217 33 @ ../crypto/hrss/hrss.c:217:33 + vshl.i16 q8, q8, #15 +.Ltmp75: + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill +.Ltmp76: + .loc 1 219 15 @ ../crypto/hrss/hrss.c:219:15 + add r0, sp, #48 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + .loc 1 224 12 @ ../crypto/hrss/hrss.c:224:12 + add r0, sp, #624 + .loc 1 219 15 @ ../crypto/hrss/hrss.c:219:15 + vext.16 q8, q8, q2, #1 + .loc 1 220 12 @ ../crypto/hrss/hrss.c:220:12 + vorr q8, q8, q9 + .loc 1 224 12 @ ../crypto/hrss/hrss.c:224:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #32 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #592 + vld1.64 {d24, d25}, [r0:128] @ 16-byte Reload + add r0, sp, #432 + vshr.u16 q8, q8, #1 + vld1.64 {d8, d9}, [r0:128] @ 16-byte Reload + add r0, sp, #512 +.Ltmp77: + .loc 1 689 40 @ ../crypto/hrss/hrss.c:689:40 + veor q9, q12, q4 + vld1.64 {d4, d5}, [r0:128] @ 16-byte Reload + add r0, sp, #400 + .loc 1 689 30 is_stmt 0 @ ../crypto/hrss/hrss.c:689:30 + vand q9, q14, q9 + vld1.64 {d10, d11}, [r0:128] @ 16-byte Reload + add r0, sp, #512 + .loc 1 685 40 is_stmt 1 @ ../crypto/hrss/hrss.c:685:40 + veor q10, q2, q5 + .loc 1 691 12 @ ../crypto/hrss/hrss.c:691:12 + veor q12, q9, q12 + .loc 1 685 30 @ ../crypto/hrss/hrss.c:685:30 + vand q10, q14, q10 +.Ltmp78: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + vand q15, q12, q0 +.Ltmp79: + .loc 1 687 12 @ ../crypto/hrss/hrss.c:687:12 + veor q2, q10, q2 + .loc 1 686 12 @ ../crypto/hrss/hrss.c:686:12 + veor q10, q10, q5 +.Ltmp80: + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + vand q13, q2, q3 +.Ltmp81: + .loc 1 690 12 @ ../crypto/hrss/hrss.c:690:12 + veor q9, q9, q4 + vst1.64 {d4, d5}, [r0:128] @ 16-byte Spill +.Ltmp82: + .loc 1 701 38 @ ../crypto/hrss/hrss.c:701:38 + veor q13, q15, q13 + add r0, sp, #592 + .loc 1 702 33 @ ../crypto/hrss/hrss.c:702:33 + vand q15, q2, q0 + .loc 1 702 44 is_stmt 0 @ ../crypto/hrss/hrss.c:702:44 + vand q2, q12, q3 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill + .loc 1 705 31 is_stmt 1 @ ../crypto/hrss/hrss.c:705:31 + vorr q4, q9, q10 +.Ltmp83: + .loc 1 225 15 @ ../crypto/hrss/hrss.c:225:15 + add r0, sp, #384 +.Ltmp84: + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q15, q2, q15 +.Ltmp85: + .loc 1 225 15 @ ../crypto/hrss/hrss.c:225:15 + vld1.64 {d24, d25}, [r0:128] @ 16-byte Reload +.Ltmp86: + .loc 1 708 22 @ ../crypto/hrss/hrss.c:708:22 + vand q5, q13, q10 +.Ltmp87: + .loc 1 218 12 @ ../crypto/hrss/hrss.c:218:12 + add r0, sp, #384 +.Ltmp88: + .loc 1 708 41 @ ../crypto/hrss/hrss.c:708:41 + vbic q2, q15, q4 + .loc 1 708 35 is_stmt 0 @ ../crypto/hrss/hrss.c:708:35 + veor q2, q2, q5 + .loc 1 706 34 is_stmt 1 @ ../crypto/hrss/hrss.c:706:34 + vorr q5, q13, q15 + .loc 1 707 41 @ ../crypto/hrss/hrss.c:707:41 + vbic q13, q13, q4 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + vbic q6, q9, q5 + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q9, q15, q9 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q6, q2, q6 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vbic q10, q10, q5 + .loc 1 707 35 is_stmt 0 @ ../crypto/hrss/hrss.c:707:35 + veor q9, q13, q9 +.Ltmp89: + .loc 1 223 33 is_stmt 1 @ ../crypto/hrss/hrss.c:223:33 + vshl.i16 q2, q6, #15 + .loc 1 227 15 @ ../crypto/hrss/hrss.c:227:15 + 
vext.16 q7, q11, q2, #1 + .loc 1 225 12 @ ../crypto/hrss/hrss.c:225:12 + vorr q8, q7, q8 + .loc 1 225 15 is_stmt 0 @ ../crypto/hrss/hrss.c:225:15 + vext.16 q7, q12, q11, #1 + .loc 1 226 12 is_stmt 1 @ ../crypto/hrss/hrss.c:226:12 + vorr q8, q8, q7 + .loc 1 218 12 @ ../crypto/hrss/hrss.c:218:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vshr.u16 q8, q1, #1 +.Ltmp90: + .loc 1 707 54 @ ../crypto/hrss/hrss.c:707:54 + veor q1, q9, q10 +.Ltmp91: + .loc 1 219 15 @ ../crypto/hrss/hrss.c:219:15 + add r0, sp, #352 + .loc 1 217 33 @ ../crypto/hrss/hrss.c:217:33 + vshl.i16 q13, q1, #15 + .loc 1 221 15 @ ../crypto/hrss/hrss.c:221:15 + vext.16 q9, q11, q13, #1 + .loc 1 219 12 @ ../crypto/hrss/hrss.c:219:12 + vorr q8, q9, q8 + .loc 1 219 15 is_stmt 0 @ ../crypto/hrss/hrss.c:219:15 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + .loc 1 224 12 is_stmt 1 @ ../crypto/hrss/hrss.c:224:12 + add r0, sp, #352 + .loc 1 219 15 @ ../crypto/hrss/hrss.c:219:15 + vext.16 q9, q9, q11, #1 + vmov.i32 q11, #0x0 + .loc 1 220 12 @ ../crypto/hrss/hrss.c:220:12 + vorr q8, q8, q9 + .loc 1 224 12 @ ../crypto/hrss/hrss.c:224:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #576 + vshr.u16 q8, q6, #1 + vld1.64 {d24, d25}, [r0:128] @ 16-byte Reload + add r0, sp, #496 + vld1.64 {d10, d11}, [r0:128] @ 16-byte Reload + add r0, sp, #480 +.Ltmp92: + .loc 1 689 40 @ ../crypto/hrss/hrss.c:689:40 + veor q9, q12, q5 + vld1.64 {d8, d9}, [r0:128] @ 16-byte Reload + add r0, sp, #416 + .loc 1 689 30 is_stmt 0 @ ../crypto/hrss/hrss.c:689:30 + vand q9, q14, q9 + vld1.64 {d12, d13}, [r0:128] @ 16-byte Reload + add r0, sp, #480 + .loc 1 685 40 is_stmt 1 @ ../crypto/hrss/hrss.c:685:40 + veor q10, q4, q6 + .loc 1 691 12 @ ../crypto/hrss/hrss.c:691:12 + veor q12, q9, q12 + .loc 1 685 30 @ ../crypto/hrss/hrss.c:685:30 + vand q10, q14, q10 +.Ltmp93: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + vand q15, q12, q0 +.Ltmp94: + .loc 1 687 12 @ ../crypto/hrss/hrss.c:687:12 + veor q4, q10, q4 + .loc 1 686 12 @ ../crypto/hrss/hrss.c:686:12 + veor q10, q10, q6 +.Ltmp95: + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + vand q14, q4, q3 +.Ltmp96: + .loc 1 690 12 @ ../crypto/hrss/hrss.c:690:12 + veor q9, q9, q5 + vst1.64 {d8, d9}, [r0:128] @ 16-byte Spill +.Ltmp97: + .loc 1 701 38 @ ../crypto/hrss/hrss.c:701:38 + veor q15, q15, q14 + add r0, sp, #576 + .loc 1 702 33 @ ../crypto/hrss/hrss.c:702:33 + vand q14, q4, q0 + .loc 1 702 44 is_stmt 0 @ ../crypto/hrss/hrss.c:702:44 + vand q4, q12, q3 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill + .loc 1 705 31 is_stmt 1 @ ../crypto/hrss/hrss.c:705:31 + vorr q5, q9, q10 +.Ltmp98: + .loc 1 218 12 @ ../crypto/hrss/hrss.c:218:12 + add r0, sp, #432 +.Ltmp99: + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q4, q4, q14 + .loc 1 708 22 @ ../crypto/hrss/hrss.c:708:22 + vand q6, q15, q10 + .loc 1 708 41 is_stmt 0 @ ../crypto/hrss/hrss.c:708:41 + vbic q14, q4, q5 + .loc 1 708 35 @ ../crypto/hrss/hrss.c:708:35 + veor q14, q14, q6 + .loc 1 706 34 is_stmt 1 @ ../crypto/hrss/hrss.c:706:34 + vorr q6, q15, q4 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + vbic q7, q9, q6 + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q9, q4, q9 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q7, q14, q7 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vbic q10, q10, q6 +.Ltmp100: + .loc 1 223 33 @ ../crypto/hrss/hrss.c:223:33 + vshl.i16 q14, q7, #15 + .loc 1 227 15 @ ../crypto/hrss/hrss.c:227:15 + vext.16 q12, q11, q14, #1 + .loc 1 225 12 @ ../crypto/hrss/hrss.c:225:12 + vorr q8, q12, q8 + .loc 1 225 15 
is_stmt 0 @ ../crypto/hrss/hrss.c:225:15 + vext.16 q12, q2, q11, #1 + .loc 1 226 12 is_stmt 1 @ ../crypto/hrss/hrss.c:226:12 + vorr q8, q8, q12 + .loc 1 225 15 @ ../crypto/hrss/hrss.c:225:15 + vext.16 q14, q14, q11, #1 +.Ltmp101: + .loc 1 707 41 @ ../crypto/hrss/hrss.c:707:41 + vbic q12, q15, q5 + .loc 1 707 35 is_stmt 0 @ ../crypto/hrss/hrss.c:707:35 + veor q9, q12, q9 +.Ltmp102: + .loc 1 218 12 is_stmt 1 @ ../crypto/hrss/hrss.c:218:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vshr.u16 q8, q1, #1 + .loc 1 224 12 @ ../crypto/hrss/hrss.c:224:12 + add r0, sp, #400 +.Ltmp103: + .loc 1 707 54 @ ../crypto/hrss/hrss.c:707:54 + veor q2, q9, q10 +.Ltmp104: + .loc 1 217 33 @ ../crypto/hrss/hrss.c:217:33 + vshl.i16 q1, q2, #15 + .loc 1 221 15 @ ../crypto/hrss/hrss.c:221:15 + vext.16 q9, q11, q1, #1 + .loc 1 219 12 @ ../crypto/hrss/hrss.c:219:12 + vorr q8, q9, q8 + .loc 1 219 15 is_stmt 0 @ ../crypto/hrss/hrss.c:219:15 + vext.16 q9, q13, q11, #1 + .loc 1 220 12 is_stmt 1 @ ../crypto/hrss/hrss.c:220:12 + vorr q8, q8, q9 + .loc 1 224 12 @ ../crypto/hrss/hrss.c:224:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #640 + vshr.u16 q8, q7, #1 + vld1.64 {d8, d9}, [r0:128] @ 16-byte Reload + add r0, sp, #560 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload + add r0, sp, #928 +.Ltmp105: + .loc 1 689 40 @ ../crypto/hrss/hrss.c:689:40 + veor q9, q15, q4 + vld1.64 {d24, d25}, [r0:128] @ 16-byte Reload + add r0, sp, #544 + .loc 1 689 30 is_stmt 0 @ ../crypto/hrss/hrss.c:689:30 + vand q9, q12, q9 + vld1.64 {d10, d11}, [r0:128] @ 16-byte Reload + add r0, sp, #464 + .loc 1 691 12 is_stmt 1 @ ../crypto/hrss/hrss.c:691:12 + veor q15, q9, q15 + vld1.64 {d12, d13}, [r0:128] @ 16-byte Reload + add r0, sp, #464 + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + veor q10, q6, q5 +.Ltmp106: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + vand q13, q15, q0 +.Ltmp107: + .loc 1 685 30 @ ../crypto/hrss/hrss.c:685:30 + vand q10, q12, q10 + .loc 1 690 12 @ ../crypto/hrss/hrss.c:690:12 + veor q9, q9, q4 + .loc 1 687 12 @ ../crypto/hrss/hrss.c:687:12 + veor q6, q10, q6 + .loc 1 686 12 @ ../crypto/hrss/hrss.c:686:12 + veor q10, q10, q5 +.Ltmp108: + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + vand q12, q6, q3 + vst1.64 {d12, d13}, [r0:128] @ 16-byte Spill + add r0, sp, #560 + .loc 1 701 38 is_stmt 0 @ ../crypto/hrss/hrss.c:701:38 + veor q12, q13, q12 + vst1.64 {d30, d31}, [r0:128] @ 16-byte Spill + .loc 1 702 33 is_stmt 1 @ ../crypto/hrss/hrss.c:702:33 + vand q13, q6, q0 + .loc 1 702 44 is_stmt 0 @ ../crypto/hrss/hrss.c:702:44 + vand q15, q15, q3 +.Ltmp109: + .loc 1 218 12 is_stmt 1 @ ../crypto/hrss/hrss.c:218:12 + add r0, sp, #496 +.Ltmp110: + .loc 1 708 22 @ ../crypto/hrss/hrss.c:708:22 + vand q5, q12, q10 + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q13, q15, q13 + .loc 1 705 31 @ ../crypto/hrss/hrss.c:705:31 + vorr q15, q9, q10 + .loc 1 708 41 @ ../crypto/hrss/hrss.c:708:41 + vbic q4, q13, q15 + .loc 1 708 35 is_stmt 0 @ ../crypto/hrss/hrss.c:708:35 + veor q4, q4, q5 + .loc 1 706 34 is_stmt 1 @ ../crypto/hrss/hrss.c:706:34 + vorr q5, q12, q13 + .loc 1 707 41 @ ../crypto/hrss/hrss.c:707:41 + vbic q12, q12, q15 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + vbic q6, q9, q5 + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q9, q13, q9 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q4, q4, q6 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vbic q10, q10, q5 + .loc 1 707 35 is_stmt 0 @ ../crypto/hrss/hrss.c:707:35 + veor q9, q12, q9 +.Ltmp111: + .loc 1 223 33 is_stmt 1 @ 
../crypto/hrss/hrss.c:223:33 + vshl.i16 q6, q4, #15 +.Ltmp112: + .loc 1 707 54 @ ../crypto/hrss/hrss.c:707:54 + veor q9, q9, q10 + vmov.i32 q13, #0x0 +.Ltmp113: + .loc 1 227 15 @ ../crypto/hrss/hrss.c:227:15 + vext.16 q7, q11, q6, #1 + vmov.i32 q11, #0x0 + .loc 1 217 33 @ ../crypto/hrss/hrss.c:217:33 + vshl.i16 q10, q9, #15 + .loc 1 225 12 @ ../crypto/hrss/hrss.c:225:12 + vorr q8, q7, q8 + .loc 1 218 12 @ ../crypto/hrss/hrss.c:218:12 + vshr.u16 q9, q9, #1 + .loc 1 226 12 @ ../crypto/hrss/hrss.c:226:12 + vorr q8, q8, q14 + .loc 1 221 15 @ ../crypto/hrss/hrss.c:221:15 + vext.16 q12, q13, q10, #1 + .loc 1 218 12 @ ../crypto/hrss/hrss.c:218:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vshr.u16 q8, q2, #1 + .loc 1 227 15 @ ../crypto/hrss/hrss.c:227:15 + add r0, sp, #416 + .loc 1 219 12 @ ../crypto/hrss/hrss.c:219:12 + vorr q8, q12, q8 + .loc 1 219 15 is_stmt 0 @ ../crypto/hrss/hrss.c:219:15 + vext.16 q12, q1, q13, #1 + .loc 1 220 12 is_stmt 1 @ ../crypto/hrss/hrss.c:220:12 + vorr q8, q8, q12 + .loc 1 224 12 @ ../crypto/hrss/hrss.c:224:12 + vshr.u16 q12, q4, #1 + .loc 1 227 15 @ ../crypto/hrss/hrss.c:227:15 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #64 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + .loc 1 221 15 @ ../crypto/hrss/hrss.c:221:15 + add r0, sp, #640 + .loc 1 227 15 @ ../crypto/hrss/hrss.c:227:15 + vext.16 q8, q13, q8, #1 + .loc 1 225 12 @ ../crypto/hrss/hrss.c:225:12 + vorr q8, q8, q12 + .loc 1 225 15 is_stmt 0 @ ../crypto/hrss/hrss.c:225:15 + vext.16 q12, q6, q13, #1 + .loc 1 226 12 is_stmt 1 @ ../crypto/hrss/hrss.c:226:12 + vorr q8, q8, q12 + .loc 1 221 15 @ ../crypto/hrss/hrss.c:221:15 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #48 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload +.Ltmp114: + .loc 1 689 40 @ ../crypto/hrss/hrss.c:689:40 + add r0, sp, #544 +.Ltmp115: + .loc 1 221 15 @ ../crypto/hrss/hrss.c:221:15 + vext.16 q8, q13, q8, #1 + .loc 1 219 12 @ ../crypto/hrss/hrss.c:219:12 + vorr q8, q8, q9 + .loc 1 219 15 is_stmt 0 @ ../crypto/hrss/hrss.c:219:15 + vext.16 q9, q10, q11, #1 + .loc 1 220 12 is_stmt 1 @ ../crypto/hrss/hrss.c:220:12 + vorr q8, q8, q9 +.Ltmp116: + .loc 1 689 40 @ ../crypto/hrss/hrss.c:689:40 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #864 + vld1.64 {d26, d27}, [r0:128] @ 16-byte Reload + add r0, sp, #256 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #928 + veor q8, q9, q13 + vld1.64 {d14, d15}, [r0:128] @ 16-byte Reload + add r0, sp, #256 + .loc 1 689 30 is_stmt 0 @ ../crypto/hrss/hrss.c:689:30 + vand q8, q7, q8 +.Ltmp117: + .loc 1 714 3 is_stmt 1 @ ../crypto/hrss/hrss.c:714:3 + vmov.32 r4, d14[0] +.Ltmp118: + .loc 1 691 12 @ ../crypto/hrss/hrss.c:691:12 + veor q15, q8, q9 + .loc 1 690 12 @ ../crypto/hrss/hrss.c:690:12 + veor q8, q8, q13 +.Ltmp119: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + vand q11, q15, q0 + vst1.64 {d30, d31}, [r0:128] @ 16-byte Spill + add r0, sp, #768 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload + add r0, sp, #240 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload + .loc 1 701 32 is_stmt 0 @ ../crypto/hrss/hrss.c:701:32 + add r0, sp, #240 +.Ltmp120: + .loc 1 685 40 is_stmt 1 @ ../crypto/hrss/hrss.c:685:40 + veor q9, q10, q14 + .loc 1 685 30 is_stmt 0 @ ../crypto/hrss/hrss.c:685:30 + vand q9, q7, q9 + .loc 1 687 12 is_stmt 1 @ ../crypto/hrss/hrss.c:687:12 + veor q12, q9, q10 + .loc 1 686 12 @ ../crypto/hrss/hrss.c:686:12 + veor q9, q9, q14 +.Ltmp121: + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + vand q10, q12, q3 + vst1.64 {d24, d25}, [r0:128] @ 
16-byte Spill + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + add r0, sp, #864 + .loc 1 701 38 @ ../crypto/hrss/hrss.c:701:38 + veor q10, q11, q10 + .loc 1 702 33 @ ../crypto/hrss/hrss.c:702:33 + vand q11, q12, q0 + .loc 1 702 44 is_stmt 0 @ ../crypto/hrss/hrss.c:702:44 + vand q12, q15, q3 +.Ltmp122: + .loc 1 766 52 is_stmt 1 @ ../crypto/hrss/hrss.c:766:52 + and.w r3, r3, r4 +.Ltmp123: + .loc 1 708 22 @ ../crypto/hrss/hrss.c:708:22 + vand q14, q10, q9 +.Ltmp124: + .loc 1 767 11 @ ../crypto/hrss/hrss.c:767:11 + eor.w r5, r5, r3 +.Ltmp125: + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q11, q12, q11 +.Ltmp126: + .loc 2 304 30 @ ../crypto/hrss/../internal.h:304:30 + add.w r4, r5, r12 +.Ltmp127: + .loc 1 705 31 @ ../crypto/hrss/hrss.c:705:31 + vorr q12, q8, q9 +.Ltmp128: + .loc 1 768 11 @ ../crypto/hrss/hrss.c:768:11 + eor.w r6, r6, r3 +.Ltmp129: + .loc 1 708 41 @ ../crypto/hrss/hrss.c:708:41 + vbic q13, q11, q12 + .loc 1 708 35 is_stmt 0 @ ../crypto/hrss/hrss.c:708:35 + veor q13, q13, q14 + .loc 1 706 34 is_stmt 1 @ ../crypto/hrss/hrss.c:706:34 + vorr q14, q10, q11 + .loc 1 707 41 @ ../crypto/hrss/hrss.c:707:41 + vbic q10, q10, q12 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + vbic q15, q8, q14 + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q8, q11, q8 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q13, q13, q15 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vbic q9, q9, q14 + .loc 1 707 35 is_stmt 0 @ ../crypto/hrss/hrss.c:707:35 + veor q8, q10, q8 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vst1.64 {d26, d27}, [r0:128] @ 16-byte Spill + add r0, sp, #768 + .loc 1 707 54 @ ../crypto/hrss/hrss.c:707:54 + veor q8, q8, q9 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #848 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload + add r0, sp, #288 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload + add r0, sp, #288 +.Ltmp130: + .loc 1 689 40 is_stmt 1 @ ../crypto/hrss/hrss.c:689:40 + veor q8, q10, q14 + .loc 1 689 30 is_stmt 0 @ ../crypto/hrss/hrss.c:689:30 + vand q9, q7, q8 + .loc 1 691 12 is_stmt 1 @ ../crypto/hrss/hrss.c:691:12 + veor q13, q9, q10 + .loc 1 690 12 @ ../crypto/hrss/hrss.c:690:12 + veor q9, q9, q14 +.Ltmp131: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + vand q12, q13, q0 + vst1.64 {d26, d27}, [r0:128] @ 16-byte Spill + add r0, sp, #784 + .loc 1 702 44 @ ../crypto/hrss/hrss.c:702:44 + vand q13, q13, q3 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload + add r0, sp, #272 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + add r0, sp, #272 +.Ltmp132: + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + veor q8, q11, q15 + .loc 1 685 30 is_stmt 0 @ ../crypto/hrss/hrss.c:685:30 + vand q10, q7, q8 + .loc 1 687 12 is_stmt 1 @ ../crypto/hrss/hrss.c:687:12 + veor q8, q10, q11 + .loc 1 686 12 @ ../crypto/hrss/hrss.c:686:12 + veor q10, q10, q15 +.Ltmp133: + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + vand q11, q8, q3 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + add r0, sp, #848 + .loc 1 701 38 @ ../crypto/hrss/hrss.c:701:38 + veor q11, q12, q11 + .loc 1 702 33 @ ../crypto/hrss/hrss.c:702:33 + vand q12, q8, q0 + .loc 1 708 22 @ ../crypto/hrss/hrss.c:708:22 + vand q15, q11, q10 + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q12, q13, q12 + .loc 1 705 31 @ ../crypto/hrss/hrss.c:705:31 + vorr q13, q9, q10 + .loc 1 708 41 @ ../crypto/hrss/hrss.c:708:41 + vbic q14, q12, q13 + .loc 1 708 35 is_stmt 0 @ ../crypto/hrss/hrss.c:708:35 + veor q14, q14, q15 + .loc 1 
706 34 is_stmt 1 @ ../crypto/hrss/hrss.c:706:34 + vorr q15, q11, q12 + .loc 1 707 41 @ ../crypto/hrss/hrss.c:707:41 + vbic q11, q11, q13 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + vbic q4, q9, q15 + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q9, q12, q9 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q8, q14, q4 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vbic q10, q10, q15 + .loc 1 707 35 is_stmt 0 @ ../crypto/hrss/hrss.c:707:35 + veor q9, q11, q9 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #784 + .loc 1 707 54 @ ../crypto/hrss/hrss.c:707:54 + veor q8, q9, q10 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #832 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #320 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload + add r0, sp, #752 +.Ltmp134: + .loc 1 689 40 is_stmt 1 @ ../crypto/hrss/hrss.c:689:40 + veor q9, q10, q8 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload + add r0, sp, #304 + .loc 1 689 30 is_stmt 0 @ ../crypto/hrss/hrss.c:689:30 + vand q9, q7, q9 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp135: + .loc 1 701 32 is_stmt 1 @ ../crypto/hrss/hrss.c:701:32 + add r0, sp, #304 +.Ltmp136: + .loc 1 691 12 @ ../crypto/hrss/hrss.c:691:12 + veor q12, q9, q10 + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + veor q10, q11, q15 +.Ltmp137: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + vand q13, q12, q0 +.Ltmp138: + .loc 1 685 30 @ ../crypto/hrss/hrss.c:685:30 + vand q10, q7, q10 + .loc 1 690 12 @ ../crypto/hrss/hrss.c:690:12 + veor q9, q9, q8 + .loc 1 687 12 @ ../crypto/hrss/hrss.c:687:12 + veor q14, q10, q11 + .loc 1 686 12 @ ../crypto/hrss/hrss.c:686:12 + veor q10, q10, q15 +.Ltmp139: + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + vand q11, q14, q3 + vst1.64 {d28, d29}, [r0:128] @ 16-byte Spill + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + add r0, sp, #832 + .loc 1 701 38 @ ../crypto/hrss/hrss.c:701:38 + veor q11, q13, q11 + .loc 1 702 33 @ ../crypto/hrss/hrss.c:702:33 + vand q13, q14, q0 + .loc 1 702 44 is_stmt 0 @ ../crypto/hrss/hrss.c:702:44 + vand q14, q12, q3 + .loc 1 708 22 is_stmt 1 @ ../crypto/hrss/hrss.c:708:22 + vand q4, q11, q10 + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q13, q14, q13 + .loc 1 705 31 @ ../crypto/hrss/hrss.c:705:31 + vorr q14, q9, q10 + .loc 1 708 41 @ ../crypto/hrss/hrss.c:708:41 + vbic q15, q13, q14 + .loc 1 708 35 is_stmt 0 @ ../crypto/hrss/hrss.c:708:35 + veor q15, q15, q4 + .loc 1 706 34 is_stmt 1 @ ../crypto/hrss/hrss.c:706:34 + vorr q4, q11, q13 + .loc 1 707 41 @ ../crypto/hrss/hrss.c:707:41 + vbic q11, q11, q14 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + vbic q5, q9, q4 + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q9, q13, q9 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q8, q15, q5 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vbic q10, q10, q4 + .loc 1 707 35 is_stmt 0 @ ../crypto/hrss/hrss.c:707:35 + veor q9, q11, q9 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #752 + .loc 1 707 54 @ ../crypto/hrss/hrss.c:707:54 + veor q8, q9, q10 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #816 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #224 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload + add r0, sp, #736 +.Ltmp140: + .loc 1 689 40 is_stmt 1 @ ../crypto/hrss/hrss.c:689:40 + veor q9, q10, q8 + vld1.64 {d2, d3}, [r0:128] @ 16-byte Reload + add r0, sp, #336 + .loc 1 689 30 is_stmt 0 @ 
../crypto/hrss/hrss.c:689:30 + vand q9, q7, q9 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp141: + .loc 1 707 60 is_stmt 1 @ ../crypto/hrss/hrss.c:707:60 + add r0, sp, #816 +.Ltmp142: + .loc 1 691 12 @ ../crypto/hrss/hrss.c:691:12 + veor q14, q9, q10 + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + veor q10, q11, q1 +.Ltmp143: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + vand q15, q14, q0 +.Ltmp144: + .loc 1 685 30 @ ../crypto/hrss/hrss.c:685:30 + vand q10, q7, q10 +.Ltmp145: + .loc 1 702 44 @ ../crypto/hrss/hrss.c:702:44 + vand q4, q14, q3 +.Ltmp146: + .loc 1 687 12 @ ../crypto/hrss/hrss.c:687:12 + veor q11, q10, q11 + .loc 1 686 12 @ ../crypto/hrss/hrss.c:686:12 + veor q10, q10, q1 +.Ltmp147: + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + vand q13, q11, q3 +.Ltmp148: + .loc 1 690 12 @ ../crypto/hrss/hrss.c:690:12 + veor q9, q9, q8 +.Ltmp149: + .loc 1 701 38 @ ../crypto/hrss/hrss.c:701:38 + veor q13, q15, q13 + .loc 1 702 33 @ ../crypto/hrss/hrss.c:702:33 + vand q15, q11, q0 + .loc 1 708 22 @ ../crypto/hrss/hrss.c:708:22 + vand q1, q13, q10 + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q15, q4, q15 + .loc 1 705 31 @ ../crypto/hrss/hrss.c:705:31 + vorr q4, q9, q10 + .loc 1 708 41 @ ../crypto/hrss/hrss.c:708:41 + vbic q5, q15, q4 + .loc 1 708 35 is_stmt 0 @ ../crypto/hrss/hrss.c:708:35 + veor q1, q5, q1 + .loc 1 706 34 is_stmt 1 @ ../crypto/hrss/hrss.c:706:34 + vorr q5, q13, q15 + .loc 1 707 41 @ ../crypto/hrss/hrss.c:707:41 + vbic q13, q13, q4 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + vbic q2, q9, q5 + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q9, q15, q9 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q8, q1, q2 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vbic q10, q10, q5 + .loc 1 707 35 is_stmt 0 @ ../crypto/hrss/hrss.c:707:35 + veor q9, q13, q9 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #736 + .loc 1 707 54 @ ../crypto/hrss/hrss.c:707:54 + veor q8, q9, q10 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #800 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #96 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload + add r0, sp, #720 +.Ltmp150: + .loc 1 689 40 is_stmt 1 @ ../crypto/hrss/hrss.c:689:40 + veor q9, q10, q8 + vld1.64 {d10, d11}, [r0:128] @ 16-byte Reload + add r0, sp, #128 + .loc 1 689 30 is_stmt 0 @ ../crypto/hrss/hrss.c:689:30 + vand q9, q7, q9 + vld1.64 {d26, d27}, [r0:128] @ 16-byte Reload +.Ltmp151: + .loc 1 707 60 is_stmt 1 @ ../crypto/hrss/hrss.c:707:60 + add r0, sp, #800 +.Ltmp152: + .loc 1 691 12 @ ../crypto/hrss/hrss.c:691:12 + veor q4, q9, q10 + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + veor q10, q13, q5 +.Ltmp153: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + vand q1, q4, q0 +.Ltmp154: + .loc 1 685 30 @ ../crypto/hrss/hrss.c:685:30 + vand q10, q7, q10 +.Ltmp155: + .loc 1 702 44 @ ../crypto/hrss/hrss.c:702:44 + vand q2, q4, q3 +.Ltmp156: + .loc 1 687 12 @ ../crypto/hrss/hrss.c:687:12 + veor q13, q10, q13 + .loc 1 686 12 @ ../crypto/hrss/hrss.c:686:12 + veor q10, q10, q5 +.Ltmp157: + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + vand q15, q13, q3 +.Ltmp158: + .loc 1 690 12 @ ../crypto/hrss/hrss.c:690:12 + veor q9, q9, q8 +.Ltmp159: + .loc 1 701 38 @ ../crypto/hrss/hrss.c:701:38 + veor q15, q1, q15 + .loc 1 702 33 @ ../crypto/hrss/hrss.c:702:33 + vand q1, q13, q0 + .loc 1 708 22 @ ../crypto/hrss/hrss.c:708:22 + vand q8, q15, q10 + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q1, q2, q1 + .loc 1 705 31 @ 
../crypto/hrss/hrss.c:705:31 + vorr q2, q9, q10 + .loc 1 708 41 @ ../crypto/hrss/hrss.c:708:41 + vbic q5, q1, q2 + .loc 1 708 35 is_stmt 0 @ ../crypto/hrss/hrss.c:708:35 + veor q8, q5, q8 + .loc 1 706 34 is_stmt 1 @ ../crypto/hrss/hrss.c:706:34 + vorr q5, q15, q1 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + vbic q6, q9, q5 + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q9, q1, q9 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q8, q8, q6 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vbic q8, q10, q5 + .loc 1 707 41 is_stmt 0 @ ../crypto/hrss/hrss.c:707:41 + vbic q10, q15, q2 +.Ltmp160: + .loc 1 689 40 is_stmt 1 @ ../crypto/hrss/hrss.c:689:40 + add r0, sp, #720 +.Ltmp161: + .loc 1 707 35 @ ../crypto/hrss/hrss.c:707:35 + veor q9, q10, q9 + .loc 1 707 54 is_stmt 0 @ ../crypto/hrss/hrss.c:707:54 + veor q8, q9, q8 +.Ltmp162: + .loc 1 689 40 is_stmt 1 @ ../crypto/hrss/hrss.c:689:40 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #704 + vld1.64 {d10, d11}, [r0:128] @ 16-byte Reload + add r0, sp, #208 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #368 + veor q8, q9, q5 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload + add r0, sp, #688 + .loc 1 689 30 is_stmt 0 @ ../crypto/hrss/hrss.c:689:30 + vand q8, q7, q8 + vld1.64 {d12, d13}, [r0:128] @ 16-byte Reload +.Ltmp163: + .loc 1 707 60 is_stmt 1 @ ../crypto/hrss/hrss.c:707:60 + add r0, sp, #704 +.Ltmp164: + .loc 1 685 40 @ ../crypto/hrss/hrss.c:685:40 + veor q10, q15, q6 + .loc 1 691 12 @ ../crypto/hrss/hrss.c:691:12 + veor q9, q8, q9 + .loc 1 685 30 @ ../crypto/hrss/hrss.c:685:30 + vand q10, q7, q10 +.Ltmp165: + .loc 1 701 44 @ ../crypto/hrss/hrss.c:701:44 + vand q2, q9, q0 +.Ltmp166: + .loc 1 687 12 @ ../crypto/hrss/hrss.c:687:12 + veor q15, q10, q15 + .loc 1 686 12 @ ../crypto/hrss/hrss.c:686:12 + veor q10, q10, q6 +.Ltmp167: + .loc 1 701 32 @ ../crypto/hrss/hrss.c:701:32 + vand q1, q15, q3 +.Ltmp168: + .loc 1 690 12 @ ../crypto/hrss/hrss.c:690:12 + veor q8, q8, q5 +.Ltmp169: + .loc 1 701 38 @ ../crypto/hrss/hrss.c:701:38 + veor q1, q2, q1 + .loc 1 702 33 @ ../crypto/hrss/hrss.c:702:33 + vand q2, q15, q0 + .loc 1 702 44 is_stmt 0 @ ../crypto/hrss/hrss.c:702:44 + vand q0, q9, q3 + .loc 1 708 22 is_stmt 1 @ ../crypto/hrss/hrss.c:708:22 + vand q5, q1, q10 + .loc 1 702 38 @ ../crypto/hrss/hrss.c:702:38 + veor q0, q0, q2 + .loc 1 705 31 @ ../crypto/hrss/hrss.c:705:31 + vorr q2, q8, q10 + .loc 1 708 41 @ ../crypto/hrss/hrss.c:708:41 + vbic q3, q0, q2 + .loc 1 708 35 is_stmt 0 @ ../crypto/hrss/hrss.c:708:35 + veor q3, q3, q5 + .loc 1 706 34 is_stmt 1 @ ../crypto/hrss/hrss.c:706:34 + vorr q5, q1, q0 + .loc 1 707 41 @ ../crypto/hrss/hrss.c:707:41 + vbic q1, q1, q2 + .loc 1 708 60 @ ../crypto/hrss/hrss.c:708:60 + vbic q6, q8, q5 + .loc 1 707 22 @ ../crypto/hrss/hrss.c:707:22 + vand q8, q0, q8 + .loc 1 708 54 @ ../crypto/hrss/hrss.c:708:54 + veor q3, q3, q6 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vbic q10, q10, q5 + .loc 1 707 35 is_stmt 0 @ ../crypto/hrss/hrss.c:707:35 + veor q8, q1, q8 + .loc 1 707 60 @ ../crypto/hrss/hrss.c:707:60 + vst1.64 {d6, d7}, [r0:128] @ 16-byte Spill +.Ltmp170: + .loc 1 203 33 is_stmt 1 @ ../crypto/hrss/hrss.c:203:33 + vshr.u16 q0, q9, #15 +.Ltmp171: + .loc 1 707 54 @ ../crypto/hrss/hrss.c:707:54 + veor q8, q8, q10 +.Ltmp172: + .loc 1 204 12 @ ../crypto/hrss/hrss.c:204:12 + add r0, sp, #688 + vmov.i32 q3, #0x0 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vshl.i16 q8, q9, #1 + .loc 1 196 12 @ ../crypto/hrss/hrss.c:196:12 + 
add r0, sp, #208 + .loc 1 205 15 @ ../crypto/hrss/hrss.c:205:15 + vext.16 q9, q3, q0, #7 + .loc 1 206 12 @ ../crypto/hrss/hrss.c:206:12 + vorr q8, q9, q8 + .loc 1 195 33 @ ../crypto/hrss/hrss.c:195:33 + vshr.u16 q9, q15, #15 + .loc 1 196 12 @ ../crypto/hrss/hrss.c:196:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vshl.i16 q8, q15, #1 + .loc 1 203 33 @ ../crypto/hrss/hrss.c:203:33 + add r0, sp, #368 + .loc 1 198 15 @ ../crypto/hrss/hrss.c:198:15 + vext.16 q10, q3, q9, #7 + .loc 1 199 12 @ ../crypto/hrss/hrss.c:199:12 + vorr q8, q10, q8 + .loc 1 203 33 @ ../crypto/hrss/hrss.c:203:33 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #288 + vld1.64 {d10, d11}, [r0:128] @ 16-byte Reload + add r0, sp, #256 + vshr.u16 q8, q5, #15 + vld1.64 {d2, d3}, [r0:128] @ 16-byte Reload + .loc 1 195 33 @ ../crypto/hrss/hrss.c:195:33 + add r0, sp, #256 + .loc 1 204 12 @ ../crypto/hrss/hrss.c:204:12 + vshl.i16 q10, q1, #1 + .loc 1 207 15 @ ../crypto/hrss/hrss.c:207:15 + vext.16 q15, q8, q3, #7 + .loc 1 205 12 @ ../crypto/hrss/hrss.c:205:12 + vorr q10, q15, q10 + .loc 1 205 15 is_stmt 0 @ ../crypto/hrss/hrss.c:205:15 + vext.16 q8, q3, q8, #7 + .loc 1 203 33 is_stmt 1 @ ../crypto/hrss/hrss.c:203:33 + vshr.u16 q15, q1, #15 + .loc 1 205 15 @ ../crypto/hrss/hrss.c:205:15 + vext.16 q15, q3, q15, #7 + .loc 1 206 12 @ ../crypto/hrss/hrss.c:206:12 + vorr q10, q10, q15 + .loc 1 195 33 @ ../crypto/hrss/hrss.c:195:33 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill + add r0, sp, #272 + vld1.64 {d12, d13}, [r0:128] @ 16-byte Reload + add r0, sp, #240 + vshr.u16 q10, q6, #15 + vld1.64 {d4, d5}, [r0:128] @ 16-byte Reload + .loc 1 204 12 @ ../crypto/hrss/hrss.c:204:12 + add r0, sp, #240 + .loc 1 196 12 @ ../crypto/hrss/hrss.c:196:12 + vshl.i16 q15, q2, #1 + .loc 1 201 15 @ ../crypto/hrss/hrss.c:201:15 + vext.16 q1, q10, q3, #7 + .loc 1 198 12 @ ../crypto/hrss/hrss.c:198:12 + vorr q15, q1, q15 + .loc 1 198 15 is_stmt 0 @ ../crypto/hrss/hrss.c:198:15 + vext.16 q10, q3, q10, #7 + .loc 1 195 33 is_stmt 1 @ ../crypto/hrss/hrss.c:195:33 + vshr.u16 q1, q2, #15 + .loc 1 198 15 @ ../crypto/hrss/hrss.c:198:15 + vext.16 q1, q3, q1, #7 + .loc 1 199 12 @ ../crypto/hrss/hrss.c:199:12 + vorr q15, q15, q1 + .loc 1 203 33 @ ../crypto/hrss/hrss.c:203:33 + vshr.u16 q1, q12, #15 + .loc 1 204 12 @ ../crypto/hrss/hrss.c:204:12 + vst1.64 {d30, d31}, [r0:128] @ 16-byte Spill + vshl.i16 q15, q5, #1 + .loc 1 196 12 @ ../crypto/hrss/hrss.c:196:12 + add r0, sp, #288 + .loc 1 207 15 @ ../crypto/hrss/hrss.c:207:15 + vext.16 q2, q1, q3, #7 + .loc 1 205 12 @ ../crypto/hrss/hrss.c:205:12 + vorr q15, q2, q15 + .loc 1 206 12 @ ../crypto/hrss/hrss.c:206:12 + vorr q8, q15, q8 + .loc 1 196 12 @ ../crypto/hrss/hrss.c:196:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #304 + vshl.i16 q8, q6, #1 + vld1.64 {d10, d11}, [r0:128] @ 16-byte Reload + add r0, sp, #80 + .loc 1 195 33 @ ../crypto/hrss/hrss.c:195:33 + vshr.u16 q15, q5, #15 + .loc 1 201 15 @ ../crypto/hrss/hrss.c:201:15 + vext.16 q2, q15, q3, #7 + .loc 1 198 12 @ ../crypto/hrss/hrss.c:198:12 + vorr q8, q2, q8 + vld1.64 {d4, d5}, [r0:128] @ 16-byte Reload + .loc 1 204 12 @ ../crypto/hrss/hrss.c:204:12 + add r0, sp, #272 + .loc 1 199 12 @ ../crypto/hrss/hrss.c:199:12 + vorr q8, q8, q10 + .loc 1 198 15 @ ../crypto/hrss/hrss.c:198:15 + vext.16 q15, q3, q15, #7 + .loc 1 203 33 @ ../crypto/hrss/hrss.c:203:33 + vshr.u16 q10, q14, #15 + .loc 1 204 12 @ ../crypto/hrss/hrss.c:204:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vshl.i16 q8, q12, #1 + .loc 1 196 12 @ 
../crypto/hrss/hrss.c:196:12 + add r0, sp, #320 + .loc 1 207 15 @ ../crypto/hrss/hrss.c:207:15 + vext.16 q12, q10, q3, #7 + .loc 1 205 12 @ ../crypto/hrss/hrss.c:205:12 + vorr q8, q12, q8 + .loc 1 205 15 is_stmt 0 @ ../crypto/hrss/hrss.c:205:15 + vext.16 q12, q3, q1, #7 + .loc 1 206 12 is_stmt 1 @ ../crypto/hrss/hrss.c:206:12 + vorr q8, q8, q12 + .loc 1 205 15 @ ../crypto/hrss/hrss.c:205:15 + vext.16 q10, q3, q10, #7 + .loc 1 195 33 @ ../crypto/hrss/hrss.c:195:33 + vshr.u16 q12, q11, #15 + .loc 1 196 12 @ ../crypto/hrss/hrss.c:196:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vshl.i16 q8, q5, #1 + add r0, sp, #160 + .loc 1 201 15 @ ../crypto/hrss/hrss.c:201:15 + vext.16 q1, q12, q3, #7 + .loc 1 198 12 @ ../crypto/hrss/hrss.c:198:12 + vorr q8, q1, q8 + vld1.64 {d2, d3}, [r0:128] @ 16-byte Reload + .loc 1 204 12 @ ../crypto/hrss/hrss.c:204:12 + add r0, sp, #304 + .loc 1 199 12 @ ../crypto/hrss/hrss.c:199:12 + vorr q8, q8, q15 + .loc 1 204 12 @ ../crypto/hrss/hrss.c:204:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vshl.i16 q8, q14, #1 + .loc 1 203 33 @ ../crypto/hrss/hrss.c:203:33 + vshr.u16 q14, q4, #15 + add r0, sp, #112 + .loc 1 207 15 @ ../crypto/hrss/hrss.c:207:15 + vext.16 q15, q14, q3, #7 + .loc 1 205 12 @ ../crypto/hrss/hrss.c:205:12 + vorr q8, q15, q8 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload + .loc 1 196 12 @ ../crypto/hrss/hrss.c:196:12 + add r0, sp, #224 + .loc 1 206 12 @ ../crypto/hrss/hrss.c:206:12 + vorr q8, q8, q10 + .loc 1 195 33 @ ../crypto/hrss/hrss.c:195:33 + vshr.u16 q10, q13, #15 + .loc 1 196 12 @ ../crypto/hrss/hrss.c:196:12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vshl.i16 q8, q11, #1 + .loc 1 207 15 @ ../crypto/hrss/hrss.c:207:15 + add r0, sp, #336 + .loc 1 201 15 @ ../crypto/hrss/hrss.c:201:15 + vext.16 q11, q10, q3, #7 + .loc 1 198 12 @ ../crypto/hrss/hrss.c:198:12 + vorr q8, q11, q8 + .loc 1 198 15 is_stmt 0 @ ../crypto/hrss/hrss.c:198:15 + vext.16 q11, q3, q12, #7 + vmov.i32 q12, #0x0 + .loc 1 199 12 is_stmt 1 @ ../crypto/hrss/hrss.c:199:12 + vorr q8, q8, q11 + .loc 1 204 12 @ ../crypto/hrss/hrss.c:204:12 + vshl.i16 q11, q4, #1 + .loc 1 207 15 @ ../crypto/hrss/hrss.c:207:15 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #176 + vext.16 q8, q0, q3, #7 + .loc 1 205 12 @ ../crypto/hrss/hrss.c:205:12 + vorr q8, q8, q11 + .loc 1 205 15 is_stmt 0 @ ../crypto/hrss/hrss.c:205:15 + vext.16 q11, q3, q14, #7 + .loc 1 206 12 is_stmt 1 @ ../crypto/hrss/hrss.c:206:12 + vorr q11, q8, q11 + .loc 1 201 15 @ ../crypto/hrss/hrss.c:201:15 + vext.16 q8, q9, q3, #7 + .loc 1 196 12 @ ../crypto/hrss/hrss.c:196:12 + vshl.i16 q9, q13, #1 + .loc 1 198 12 @ ../crypto/hrss/hrss.c:198:12 + vorr q8, q8, q9 + .loc 1 198 15 is_stmt 0 @ ../crypto/hrss/hrss.c:198:15 + vext.16 q9, q12, q10, #7 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp173: + .loc 2 304 38 is_stmt 1 @ ../crypto/hrss/../internal.h:304:38 + sub.w r0, r5, #2 + .loc 2 304 33 is_stmt 0 @ ../crypto/hrss/../internal.h:304:33 + and.w r0, r0, r4 +.Ltmp174: + .loc 1 199 12 is_stmt 1 @ ../crypto/hrss/hrss.c:199:12 + vorr q12, q8, q9 +.Ltmp175: + .loc 1 770 10 @ ../crypto/hrss/hrss.c:770:10 + sub.w r5, r5, #1 +.Ltmp176: + .loc 2 234 13 @ ../crypto/hrss/../internal.h:234:13 + asr.w r0, r0, #31 +.Ltmp177: + .loc 1 153 50 @ ../crypto/hrss/hrss.c:153:50 + vdup.16 q8, r0 + add r0, sp, #192 +.Ltmp178: + .loc 1 782 19 @ ../crypto/hrss/hrss.c:782:19 + vmvn q9, q8 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload +.Ltmp179: + .loc 1 714 3 @ ../crypto/hrss/hrss.c:714:3 + add r0, sp, #912 + vld1.64 {d26, 
d27}, [r0:128] @ 16-byte Reload +.Ltmp180: + .loc 1 714 3 is_stmt 0 @ ../crypto/hrss/hrss.c:714:3 + add r0, sp, #144 + vorr q14, q9, q9 +.Ltmp181: + .loc 1 714 3 @ ../crypto/hrss/hrss.c:714:3 + vmov.32 r4, d26[0] +.Ltmp182: + .loc 1 714 3 @ ../crypto/hrss/hrss.c:714:3 + vld1.64 {d26, d27}, [r0:128] @ 16-byte Reload + vmov.32 r0, d26[0] +.Ltmp183: + .loc 1 775 56 is_stmt 1 @ ../crypto/hrss/hrss.c:775:56 + and.w r0, r0, r4 + .loc 1 771 12 @ ../crypto/hrss/hrss.c:771:12 + and r4, r4, #1 + .loc 1 771 7 is_stmt 0 @ ../crypto/hrss/hrss.c:771:7 + add r1, r4 +.Ltmp184: + .loc 2 343 16 is_stmt 1 @ ../crypto/hrss/../internal.h:343:16 + and.w r4, r0, r1 + .loc 2 343 30 is_stmt 0 @ ../crypto/hrss/../internal.h:343:30 + bic.w r0, lr, r0 + .loc 2 343 21 @ ../crypto/hrss/../internal.h:343:21 + orr.w lr, r0, r4 +.Ltmp185: + .loc 1 747 3 is_stmt 1 @ ../crypto/hrss/hrss.c:747:3 + bne.w .LBB0_3 +@ %bb.4: + .loc 1 786 3 @ ../crypto/hrss/hrss.c:786:3 + add r0, sp, #720 + add.w r1, r10, #16 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #736 + mov r5, r10 + .loc 1 787 3 @ ../crypto/hrss/hrss.c:787:3 + add.w r9, r10, #88 + .loc 1 786 3 @ ../crypto/hrss/hrss.c:786:3 + vst1.32 {d16, d17}, [r1] + add.w r1, r10, #32 + .loc 1 787 3 @ ../crypto/hrss/hrss.c:787:3 + mov r8, r9 + .loc 1 786 3 @ ../crypto/hrss/hrss.c:786:3 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #752 + vst1.32 {d16, d17}, [r1] + add.w r1, r10, #48 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #784 + vst1.32 {d16, d17}, [r1] + add.w r1, r10, #64 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #768 + vst1.32 {d16, d17}, [r1] + .loc 1 787 3 @ ../crypto/hrss/hrss.c:787:3 + movs r1, #104 + .loc 1 786 3 @ ../crypto/hrss/hrss.c:786:3 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #688 + vstr d16, [r10, #80] + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + .loc 1 787 3 @ ../crypto/hrss/hrss.c:787:3 + add r0, sp, #800 + .loc 1 786 3 @ ../crypto/hrss/hrss.c:786:3 + vst1.16 {d16, d17}, [r5], r1 + .loc 1 787 3 @ ../crypto/hrss/hrss.c:787:3 + add.w r1, r10, #120 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #816 + vst1.32 {d16, d17}, [r5] + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #832 + vst1.32 {d16, d17}, [r1] + add.w r1, r10, #136 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #848 + vst1.32 {d16, d17}, [r1] + add.w r1, r10, #152 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #704 + vst1.32 {d16, d17}, [r1] + movs r1, #80 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #864 + vst1.16 {d16, d17}, [r8], r1 +.Ltmp186: + .loc 2 270 42 @ ../crypto/hrss/../internal.h:270:42 + movw r1, #701 + sub.w r2, r1, lr + .loc 2 270 38 is_stmt 0 @ ../crypto/hrss/../internal.h:270:38 + orr.w r2, r2, lr +.Ltmp187: + .loc 1 787 3 is_stmt 1 @ ../crypto/hrss/hrss.c:787:3 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload +.Ltmp188: + .loc 1 461 3 @ ../crypto/hrss/hrss.c:461:3 + mov r0, r10 +.Ltmp189: + .loc 1 785 17 @ ../crypto/hrss/hrss.c:785:17 + and.w r1, r1, r2, asr #31 + .loc 1 787 3 @ ../crypto/hrss/hrss.c:787:3 + vstr d16, [r8] + .loc 1 785 12 @ ../crypto/hrss/hrss.c:785:12 + sub.w r6, lr, r1 +.Ltmp190: + .loc 1 461 3 @ ../crypto/hrss/hrss.c:461:3 + mov r1, r6 + bl poly2_rotr_consttime + .loc 1 462 3 @ ../crypto/hrss/hrss.c:462:3 + mov r0, r9 + mov r1, r6 + bl poly2_rotr_consttime +.Ltmp191: + .loc 1 714 3 @ ../crypto/hrss/hrss.c:714:3 + add.w lr, sp, #880 +.Ltmp192: + .loc 1 789 3 @ ../crypto/hrss/hrss.c:789:3 + mov r0, r10 
+.Ltmp193: + .loc 1 714 3 @ ../crypto/hrss/hrss.c:714:3 + vld1.64 {d16, d17}, [lr:128] @ 16-byte Reload +.Ltmp194: + .loc 1 714 3 is_stmt 0 @ ../crypto/hrss/hrss.c:714:3 + add.w lr, sp, #896 +.Ltmp195: + .loc 1 714 3 @ ../crypto/hrss/hrss.c:714:3 + vmov.32 r1, d16[0] +.Ltmp196: + .loc 1 714 3 @ ../crypto/hrss/hrss.c:714:3 + vld1.64 {d16, d17}, [lr:128] @ 16-byte Reload + vmov.32 r2, d16[0] +.Ltmp197: + .loc 1 789 3 is_stmt 1 @ ../crypto/hrss/hrss.c:789:3 + bl poly3_mul_const + movs r0, #84 +.Ltmp198: + .loc 1 500 44 @ ../crypto/hrss/hrss.c:500:44 + ldr.w r1, [r10, #84] + .loc 1 505 22 @ ../crypto/hrss/hrss.c:505:22 + vld1.32 {d19}, [r9], r0 + movs r6, #112 +.Ltmp199: + .loc 1 791 1 @ ../crypto/hrss/hrss.c:791:1 + sub.w r4, r7, #88 +.Ltmp200: + .loc 1 499 44 @ ../crypto/hrss/hrss.c:499:44 + ldr.w r0, [r9] +.Ltmp201: + .loc 1 489 35 @ ../crypto/hrss/hrss.c:489:35 + orr.w r3, r1, r0 +.Ltmp202: + .loc 1 489 75 is_stmt 0 @ ../crypto/hrss/hrss.c:489:75 + sbfx r2, r0, #28, #1 +.Ltmp203: + .loc 1 504 22 is_stmt 1 @ ../crypto/hrss/hrss.c:504:22 + mov r0, r10 + vld1.32 {d20}, [r0], r6 +.Ltmp204: + .loc 1 489 75 @ ../crypto/hrss/hrss.c:489:75 + sbfx r1, r1, #28, #1 +.Ltmp205: + .loc 1 506 27 @ ../crypto/hrss/hrss.c:506:27 + vorr d21, d19, d20 + .loc 1 503 3 @ ../crypto/hrss/hrss.c:503:3 + vdup.32 d16, r1 +.Ltmp206: + .loc 1 489 75 @ ../crypto/hrss/hrss.c:489:75 + lsls r1, r3, #3 +.Ltmp207: + .loc 1 501 21 @ ../crypto/hrss/hrss.c:501:21 + mvn.w r1, r1, asr #31 + .loc 1 507 20 @ ../crypto/hrss/hrss.c:507:20 + vand d22, d19, d16 + .loc 1 503 3 @ ../crypto/hrss/hrss.c:503:3 + vdup.32 d18, r1 + .loc 1 507 15 @ ../crypto/hrss/hrss.c:507:15 + mov r1, r10 + .loc 1 507 56 is_stmt 0 @ ../crypto/hrss/hrss.c:507:56 + vand d23, d20, d18 + .loc 1 503 3 is_stmt 1 @ ../crypto/hrss/hrss.c:503:3 + vdup.32 d17, r2 + .loc 1 507 38 @ ../crypto/hrss/hrss.c:507:38 + vbic d24, d17, d21 + .loc 1 505 22 @ ../crypto/hrss/hrss.c:505:22 + movs r2, #96 + .loc 1 507 32 @ ../crypto/hrss/hrss.c:507:32 + veor d22, d22, d23 + .loc 1 504 22 @ ../crypto/hrss/hrss.c:504:22 + vldr d25, [r10, #16] + vldr d26, [r10, #24] + .loc 1 508 20 @ ../crypto/hrss/hrss.c:508:20 + vand d30, d25, d17 + .loc 1 507 50 @ ../crypto/hrss/hrss.c:507:50 + veor d22, d22, d24 + .loc 1 504 22 @ ../crypto/hrss/hrss.c:504:22 + vldr d23, [r10, #8] + .loc 1 507 15 @ ../crypto/hrss/hrss.c:507:15 + vst1.32 {d22}, [r1], r2 + .loc 1 508 20 @ ../crypto/hrss/hrss.c:508:20 + vand d24, d23, d17 + vand d3, d26, d17 + .loc 1 505 22 @ ../crypto/hrss/hrss.c:505:22 + vldr d22, [r1] + .loc 1 508 20 @ ../crypto/hrss/hrss.c:508:20 + vand d20, d20, d17 + .loc 1 505 22 @ ../crypto/hrss/hrss.c:505:22 + vldr d27, [r5] + .loc 1 506 27 @ ../crypto/hrss/hrss.c:506:27 + vorr d31, d22, d23 + .loc 1 508 56 @ ../crypto/hrss/hrss.c:508:56 + vand d28, d22, d18 + .loc 1 505 22 @ ../crypto/hrss/hrss.c:505:22 + vldr d0, [r0] + .loc 1 506 27 @ ../crypto/hrss/hrss.c:506:27 + vorr d29, d27, d25 + .loc 1 508 56 @ ../crypto/hrss/hrss.c:508:56 + vand d1, d27, d18 + .loc 1 506 27 @ ../crypto/hrss/hrss.c:506:27 + vorr d2, d0, d26 + .loc 1 508 56 @ ../crypto/hrss/hrss.c:508:56 + vand d4, d0, d18 + .loc 1 508 32 is_stmt 0 @ ../crypto/hrss/hrss.c:508:32 + veor d24, d28, d24 + .loc 1 508 38 @ ../crypto/hrss/hrss.c:508:38 + vbic d5, d16, d31 + .loc 1 508 32 @ ../crypto/hrss/hrss.c:508:32 + veor d30, d1, d30 + .loc 1 508 38 @ ../crypto/hrss/hrss.c:508:38 + vbic d28, d16, d29 + vbic d1, d16, d2 + .loc 1 508 32 @ ../crypto/hrss/hrss.c:508:32 + veor d3, d4, d3 + .loc 1 508 50 @ ../crypto/hrss/hrss.c:508:50 + veor 
d24, d24, d5 + .loc 1 508 56 @ ../crypto/hrss/hrss.c:508:56 + vand d19, d19, d18 + .loc 1 508 50 @ ../crypto/hrss/hrss.c:508:50 + veor d28, d30, d28 + veor d30, d3, d1 + .loc 1 508 15 @ ../crypto/hrss/hrss.c:508:15 + vstr d24, [r1] + .loc 1 508 32 @ ../crypto/hrss/hrss.c:508:32 + veor d19, d19, d20 + .loc 1 504 22 is_stmt 1 @ ../crypto/hrss/hrss.c:504:22 + vldr d24, [r10, #32] + .loc 1 508 38 @ ../crypto/hrss/hrss.c:508:38 + vbic d21, d16, d21 + .loc 1 507 20 @ ../crypto/hrss/hrss.c:507:20 + vand d27, d27, d16 + .loc 1 507 56 is_stmt 0 @ ../crypto/hrss/hrss.c:507:56 + vand d25, d25, d18 + .loc 1 507 20 @ ../crypto/hrss/hrss.c:507:20 + vand d0, d0, d16 + .loc 1 507 56 @ ../crypto/hrss/hrss.c:507:56 + vand d26, d26, d18 + vand d23, d23, d18 + .loc 1 507 20 @ ../crypto/hrss/hrss.c:507:20 + vand d22, d22, d16 + .loc 1 507 38 @ ../crypto/hrss/hrss.c:507:38 + vbic d20, d17, d29 + vbic d29, d17, d31 + .loc 1 505 22 is_stmt 1 @ ../crypto/hrss/hrss.c:505:22 + vldr d31, [r10, #120] + .loc 1 508 15 @ ../crypto/hrss/hrss.c:508:15 + vstr d28, [r5] + .loc 1 507 32 @ ../crypto/hrss/hrss.c:507:32 + veor d22, d22, d23 + .loc 1 508 15 @ ../crypto/hrss/hrss.c:508:15 + vstr d30, [r0] + .loc 1 507 32 @ ../crypto/hrss/hrss.c:507:32 + veor d25, d27, d25 + .loc 1 505 22 @ ../crypto/hrss/hrss.c:505:22 + vldr d1, [r10, #128] + .loc 1 507 32 @ ../crypto/hrss/hrss.c:507:32 + veor d26, d0, d26 + .loc 1 504 22 @ ../crypto/hrss/hrss.c:504:22 + vldr d30, [r10, #40] + .loc 1 508 50 @ ../crypto/hrss/hrss.c:508:50 + veor d19, d19, d21 + .loc 1 507 38 @ ../crypto/hrss/hrss.c:507:38 + vbic d27, d17, d2 + .loc 1 508 20 @ ../crypto/hrss/hrss.c:508:20 + vand d23, d24, d17 + .loc 1 507 56 @ ../crypto/hrss/hrss.c:507:56 + vand d28, d24, d18 + .loc 1 508 15 @ ../crypto/hrss/hrss.c:508:15 + vstr d19, [r10, #88] + .loc 1 506 27 @ ../crypto/hrss/hrss.c:506:27 + vorr d21, d1, d30 + .loc 1 507 20 @ ../crypto/hrss/hrss.c:507:20 + vand d0, d31, d16 + .loc 1 506 27 @ ../crypto/hrss/hrss.c:506:27 + vorr d24, d31, d24 + .loc 1 508 56 @ ../crypto/hrss/hrss.c:508:56 + vand d31, d31, d18 + .loc 1 507 56 @ ../crypto/hrss/hrss.c:507:56 + vand d2, d30, d18 + .loc 1 508 20 @ ../crypto/hrss/hrss.c:508:20 + vand d4, d30, d17 + .loc 1 507 20 @ ../crypto/hrss/hrss.c:507:20 + vand d3, d1, d16 + .loc 1 508 56 @ ../crypto/hrss/hrss.c:508:56 + vand d5, d1, d18 + .loc 1 507 50 @ ../crypto/hrss/hrss.c:507:50 + veor d20, d25, d20 + veor d25, d26, d27 + veor d22, d22, d29 + .loc 1 507 32 is_stmt 0 @ ../crypto/hrss/hrss.c:507:32 + veor d26, d0, d28 + .loc 1 507 15 @ ../crypto/hrss/hrss.c:507:15 + vstr d20, [r10, #16] + .loc 1 507 38 @ ../crypto/hrss/hrss.c:507:38 + vbic d27, d17, d24 + .loc 1 507 15 @ ../crypto/hrss/hrss.c:507:15 + vstr d25, [r10, #24] + .loc 1 508 32 is_stmt 1 @ ../crypto/hrss/hrss.c:508:32 + veor d23, d31, d23 + .loc 1 507 15 @ ../crypto/hrss/hrss.c:507:15 + vstr d22, [r10, #8] + .loc 1 508 38 @ ../crypto/hrss/hrss.c:508:38 + vbic d24, d16, d24 + .loc 1 507 32 @ ../crypto/hrss/hrss.c:507:32 + veor d28, d3, d2 + .loc 1 507 38 is_stmt 0 @ ../crypto/hrss/hrss.c:507:38 + vbic d19, d17, d21 + .loc 1 508 38 is_stmt 1 @ ../crypto/hrss/hrss.c:508:38 + vbic d30, d16, d21 + .loc 1 508 32 is_stmt 0 @ ../crypto/hrss/hrss.c:508:32 + veor d29, d5, d4 + .loc 1 507 50 is_stmt 1 @ ../crypto/hrss/hrss.c:507:50 + veor d20, d26, d27 + .loc 1 508 50 @ ../crypto/hrss/hrss.c:508:50 + veor d21, d23, d24 + .loc 1 507 50 @ ../crypto/hrss/hrss.c:507:50 + veor d19, d28, d19 + .loc 1 508 50 @ ../crypto/hrss/hrss.c:508:50 + veor d22, d29, d30 + .loc 1 507 15 @ 
../crypto/hrss/hrss.c:507:15 + vstr d20, [r10, #32] + .loc 1 508 15 @ ../crypto/hrss/hrss.c:508:15 + vstr d21, [r10, #120] + .loc 1 507 15 @ ../crypto/hrss/hrss.c:507:15 + vstr d19, [r10, #40] + .loc 1 508 15 @ ../crypto/hrss/hrss.c:508:15 + vstr d22, [r10, #128] + .loc 1 504 22 @ ../crypto/hrss/hrss.c:504:22 + vldr d19, [r10, #48] + .loc 1 505 22 @ ../crypto/hrss/hrss.c:505:22 + vldr d20, [r10, #136] + .loc 1 507 56 @ ../crypto/hrss/hrss.c:507:56 + vand d21, d19, d18 + .loc 1 507 20 is_stmt 0 @ ../crypto/hrss/hrss.c:507:20 + vand d22, d20, d16 + .loc 1 506 27 is_stmt 1 @ ../crypto/hrss/hrss.c:506:27 + vorr d23, d20, d19 + .loc 1 508 20 @ ../crypto/hrss/hrss.c:508:20 + vand d19, d19, d17 + .loc 1 508 56 is_stmt 0 @ ../crypto/hrss/hrss.c:508:56 + vand d20, d20, d18 + .loc 1 507 32 is_stmt 1 @ ../crypto/hrss/hrss.c:507:32 + veor d21, d22, d21 + .loc 1 507 38 is_stmt 0 @ ../crypto/hrss/hrss.c:507:38 + vbic d22, d17, d23 + .loc 1 508 32 is_stmt 1 @ ../crypto/hrss/hrss.c:508:32 + veor d19, d20, d19 + .loc 1 508 38 is_stmt 0 @ ../crypto/hrss/hrss.c:508:38 + vbic d20, d16, d23 + .loc 1 507 50 is_stmt 1 @ ../crypto/hrss/hrss.c:507:50 + veor d21, d21, d22 + .loc 1 508 50 @ ../crypto/hrss/hrss.c:508:50 + veor d19, d19, d20 + .loc 1 507 15 @ ../crypto/hrss/hrss.c:507:15 + vstr d21, [r10, #48] + .loc 1 508 15 @ ../crypto/hrss/hrss.c:508:15 + vstr d19, [r10, #136] + .loc 1 504 22 @ ../crypto/hrss/hrss.c:504:22 + vldr d19, [r10, #56] + .loc 1 505 22 @ ../crypto/hrss/hrss.c:505:22 + vldr d20, [r10, #144] + .loc 1 507 56 @ ../crypto/hrss/hrss.c:507:56 + vand d21, d19, d18 + .loc 1 507 20 is_stmt 0 @ ../crypto/hrss/hrss.c:507:20 + vand d22, d20, d16 + .loc 1 506 27 is_stmt 1 @ ../crypto/hrss/hrss.c:506:27 + vorr d23, d20, d19 + .loc 1 508 20 @ ../crypto/hrss/hrss.c:508:20 + vand d19, d19, d17 + .loc 1 508 56 is_stmt 0 @ ../crypto/hrss/hrss.c:508:56 + vand d20, d20, d18 + .loc 1 507 32 is_stmt 1 @ ../crypto/hrss/hrss.c:507:32 + veor d21, d22, d21 + .loc 1 507 38 is_stmt 0 @ ../crypto/hrss/hrss.c:507:38 + vbic d22, d17, d23 + .loc 1 508 32 is_stmt 1 @ ../crypto/hrss/hrss.c:508:32 + veor d19, d20, d19 + .loc 1 508 38 is_stmt 0 @ ../crypto/hrss/hrss.c:508:38 + vbic d20, d16, d23 + .loc 1 507 50 is_stmt 1 @ ../crypto/hrss/hrss.c:507:50 + veor d21, d21, d22 + .loc 1 508 50 @ ../crypto/hrss/hrss.c:508:50 + veor d19, d19, d20 + .loc 1 507 15 @ ../crypto/hrss/hrss.c:507:15 + vstr d21, [r10, #56] + .loc 1 508 15 @ ../crypto/hrss/hrss.c:508:15 + vstr d19, [r10, #144] + .loc 1 504 22 @ ../crypto/hrss/hrss.c:504:22 + vldr d19, [r10, #64] + .loc 1 505 22 @ ../crypto/hrss/hrss.c:505:22 + vldr d20, [r10, #152] + .loc 1 507 56 @ ../crypto/hrss/hrss.c:507:56 + vand d21, d19, d18 + .loc 1 507 20 is_stmt 0 @ ../crypto/hrss/hrss.c:507:20 + vand d22, d20, d16 + .loc 1 506 27 is_stmt 1 @ ../crypto/hrss/hrss.c:506:27 + vorr d23, d20, d19 + .loc 1 508 20 @ ../crypto/hrss/hrss.c:508:20 + vand d19, d19, d17 + .loc 1 508 56 is_stmt 0 @ ../crypto/hrss/hrss.c:508:56 + vand d20, d20, d18 + .loc 1 507 32 is_stmt 1 @ ../crypto/hrss/hrss.c:507:32 + veor d21, d22, d21 + .loc 1 507 38 is_stmt 0 @ ../crypto/hrss/hrss.c:507:38 + vbic d22, d17, d23 + .loc 1 508 32 is_stmt 1 @ ../crypto/hrss/hrss.c:508:32 + veor d19, d20, d19 + .loc 1 508 38 is_stmt 0 @ ../crypto/hrss/hrss.c:508:38 + vbic d20, d16, d23 + .loc 1 507 50 is_stmt 1 @ ../crypto/hrss/hrss.c:507:50 + veor d21, d21, d22 + .loc 1 508 50 @ ../crypto/hrss/hrss.c:508:50 + veor d19, d19, d20 + .loc 1 507 15 @ ../crypto/hrss/hrss.c:507:15 + vstr d21, [r10, #64] + .loc 1 508 15 @ 
../crypto/hrss/hrss.c:508:15 + vstr d19, [r10, #152] + .loc 1 504 22 @ ../crypto/hrss/hrss.c:504:22 + vldr d19, [r10, #72] + .loc 1 505 22 @ ../crypto/hrss/hrss.c:505:22 + vldr d20, [r10, #160] + .loc 1 507 56 @ ../crypto/hrss/hrss.c:507:56 + vand d21, d19, d18 + .loc 1 507 20 is_stmt 0 @ ../crypto/hrss/hrss.c:507:20 + vand d22, d20, d16 + .loc 1 506 27 is_stmt 1 @ ../crypto/hrss/hrss.c:506:27 + vorr d23, d20, d19 + .loc 1 508 20 @ ../crypto/hrss/hrss.c:508:20 + vand d19, d19, d17 + .loc 1 508 56 is_stmt 0 @ ../crypto/hrss/hrss.c:508:56 + vand d20, d20, d18 + .loc 1 507 32 is_stmt 1 @ ../crypto/hrss/hrss.c:507:32 + veor d21, d22, d21 + .loc 1 507 38 is_stmt 0 @ ../crypto/hrss/hrss.c:507:38 + vbic d22, d17, d23 + .loc 1 508 32 is_stmt 1 @ ../crypto/hrss/hrss.c:508:32 + veor d19, d20, d19 + .loc 1 508 38 is_stmt 0 @ ../crypto/hrss/hrss.c:508:38 + vbic d20, d16, d23 + .loc 1 507 50 is_stmt 1 @ ../crypto/hrss/hrss.c:507:50 + veor d21, d21, d22 + .loc 1 508 50 @ ../crypto/hrss/hrss.c:508:50 + veor d19, d19, d20 + .loc 1 507 15 @ ../crypto/hrss/hrss.c:507:15 + vstr d21, [r10, #72] + .loc 1 508 15 @ ../crypto/hrss/hrss.c:508:15 + vstr d19, [r10, #160] + .loc 1 505 22 @ ../crypto/hrss/hrss.c:505:22 + vldr d19, [r8] + .loc 1 504 22 @ ../crypto/hrss/hrss.c:504:22 + vldr d20, [r10, #80] + .loc 1 507 20 @ ../crypto/hrss/hrss.c:507:20 + vand d22, d19, d16 + .loc 1 506 27 @ ../crypto/hrss/hrss.c:506:27 + vorr d21, d19, d20 + .loc 1 507 56 @ ../crypto/hrss/hrss.c:507:56 + vand d23, d20, d18 + .loc 1 508 56 @ ../crypto/hrss/hrss.c:508:56 + vand d18, d19, d18 + .loc 1 508 20 is_stmt 0 @ ../crypto/hrss/hrss.c:508:20 + vand d19, d20, d17 + .loc 1 507 38 is_stmt 1 @ ../crypto/hrss/hrss.c:507:38 + vbic d17, d17, d21 + .loc 1 507 32 is_stmt 0 @ ../crypto/hrss/hrss.c:507:32 + veor d20, d22, d23 + .loc 1 508 38 is_stmt 1 @ ../crypto/hrss/hrss.c:508:38 + vbic d16, d16, d21 + .loc 1 508 32 is_stmt 0 @ ../crypto/hrss/hrss.c:508:32 + veor d18, d18, d19 + .loc 1 507 50 is_stmt 1 @ ../crypto/hrss/hrss.c:507:50 + veor d17, d20, d17 + .loc 1 508 50 @ ../crypto/hrss/hrss.c:508:50 + veor d16, d18, d16 + .loc 1 507 15 @ ../crypto/hrss/hrss.c:507:15 + vstr d17, [r10, #80] + .loc 1 511 30 @ ../crypto/hrss/hrss.c:511:30 + ldr.w r0, [r10, #84] + .loc 1 508 15 @ ../crypto/hrss/hrss.c:508:15 + vstr d16, [r8] + .loc 1 512 30 @ ../crypto/hrss/hrss.c:512:30 + ldr.w r1, [r9] + .loc 1 511 30 @ ../crypto/hrss/hrss.c:511:30 + bic r0, r0, #-536870912 + str.w r0, [r10, #84] + .loc 1 512 30 @ ../crypto/hrss/hrss.c:512:30 + bic r0, r1, #-536870912 + str.w r0, [r9] +.Ltmp208: + .loc 1 791 1 @ ../crypto/hrss/hrss.c:791:1 + mov sp, r4 + vpop {d8, d9, d10, d11, d12, d13, d14, d15} + pop.w {r8, r9, r10} + pop {r4, r5, r6, r7, pc} +.Ltmp209: +@ %bb.5: +.Lfunc_end0: + .size poly3_invert_vec, .Lfunc_end0-poly3_invert_vec + .cfi_endproc + .fnend + + .section .text.poly_mul_vec,"ax",%progbits + .hidden poly_mul_vec @ -- Begin function poly_mul_vec + .globl poly_mul_vec + .p2align 2 + .type poly_mul_vec,%function + .code 16 @ @poly_mul_vec + .thumb_func +poly_mul_vec: +.Lfunc_begin2: + .loc 1 1087 0 @ ../crypto/hrss/hrss.c:1087:0 + .fnstart + .cfi_startproc +@ %bb.0: + .save {r4, r5, r6, r7, lr} + push {r4, r5, r6, r7, lr} + .cfi_def_cfa_offset 20 + .cfi_offset lr, -4 + .cfi_offset r7, -8 + .cfi_offset r6, -12 + .cfi_offset r5, -16 + .cfi_offset r4, -20 + .setfp r7, sp, #12 + add r7, sp, #12 + .cfi_def_cfa r7, 8 + .save {r8, r9, r11} + push.w {r8, r9, r11} + .cfi_offset r11, -24 + .cfi_offset r9, -28 + .cfi_offset r8, -32 + .pad #5600 + sub.w sp, sp, 
#5600 + mov r4, sp + bfc r4, #0, #4 + mov sp, r4 + mov r4, r0 + ldr r0, .LCPI2_0 + add.w r8, sp, #12 + movs r6, #0 +.LPC2_0: + add r0, pc + add.w r5, sp, #2768 + mov r3, r2 + mov r2, r1 + ldr.w r9, [r0] + ldr.w r0, [r9] + str.w r0, [r8] +.Ltmp218: + .loc 1 1098 3 prologue_end @ ../crypto/hrss/hrss.c:1098:3 + movs r0, #88 +.Ltmp219: + .loc 2 713 10 @ ../crypto/hrss/../internal.h:713:10 + strh.w r6, [r1, #1406] + str.w r6, [r1, #1402] + add r1, sp, #16 +.Ltmp220: + .loc 2 713 10 is_stmt 0 @ ../crypto/hrss/../internal.h:713:10 + strh.w r6, [r3, #1406] + str.w r6, [r3, #1402] +.Ltmp221: + .loc 1 1098 3 is_stmt 1 @ ../crypto/hrss/hrss.c:1098:3 + str r0, [sp] + mov r0, r5 + bl poly_mul_vec_aux + add.w r0, r5, #1392 + .loc 1 1108 24 @ ../crypto/hrss/hrss.c:1108:24 + vld1.64 {d16, d17}, [r0:128] + mov.w r0, #1408 +.LBB2_1: @ =>This Inner Loop Header: Depth=1 + .loc 1 1109 24 @ ../crypto/hrss/hrss.c:1109:24 + adds r1, r5, r6 + .loc 1 1110 27 @ ../crypto/hrss/hrss.c:1110:27 + vld1.16 {d18, d19}, [r1:128], r0 + .loc 1 1109 24 @ ../crypto/hrss/hrss.c:1109:24 + vld1.64 {d20, d21}, [r1:128] + .loc 1 1110 17 @ ../crypto/hrss/hrss.c:1110:17 + adds r1, r4, r6 + .loc 1 1107 24 @ ../crypto/hrss/hrss.c:1107:24 + adds r6, #16 +.Ltmp222: + .loc 1 181 10 @ ../crypto/hrss/hrss.c:181:10 + vext.16 q8, q8, q10, #5 +.Ltmp223: + .loc 1 1107 3 @ ../crypto/hrss/hrss.c:1107:3 + cmp.w r6, #1408 +.Ltmp224: + .loc 1 155 58 @ ../crypto/hrss/hrss.c:155:58 + vadd.i16 q8, q8, q9 +.Ltmp225: + .loc 1 1110 17 @ ../crypto/hrss/hrss.c:1110:17 + vst1.64 {d16, d17}, [r1:128] + .loc 1 1108 24 @ ../crypto/hrss/hrss.c:1108:24 + vorr q8, q10, q10 + .loc 1 1107 3 @ ../crypto/hrss/hrss.c:1107:3 + bne .LBB2_1 +@ %bb.2: +.Ltmp226: + .loc 2 713 10 @ ../crypto/hrss/../internal.h:713:10 + movs r0, #0 + strh.w r0, [r4, #1406] + str.w r0, [r4, #1402] + ldr.w r0, [r8] + ldr.w r1, [r9] + subs r0, r1, r0 +.Ltmp227: + .loc 1 1114 1 @ ../crypto/hrss/hrss.c:1114:1 + itttt eq + subeq.w r4, r7, #24 + moveq sp, r4 + popeq.w {r8, r9, r11} + popeq {r4, r5, r6, r7, pc} + bl __stack_chk_fail +.Ltmp228: + .p2align 2 +@ %bb.3: + .loc 1 0 1 is_stmt 0 @ ../crypto/hrss/hrss.c:0:1 +.LCPI2_0: +.Ltmp229: + .long __stack_chk_guard(GOT_PREL)-((.LPC2_0+4)-.Ltmp229) +.Lfunc_end2: + .size poly_mul_vec, .Lfunc_end2-poly_mul_vec + .cfi_endproc + .fnend + @ -- End function + .section .text.poly_mul_vec_aux,"ax",%progbits + .p2align 1 @ -- Begin function poly_mul_vec_aux + .type poly_mul_vec_aux,%function + .code 16 @ @poly_mul_vec_aux + .thumb_func +poly_mul_vec_aux: +.Lfunc_begin3: + .loc 1 897 0 is_stmt 1 @ ../crypto/hrss/hrss.c:897:0 + .fnstart + .cfi_startproc +@ %bb.0: + .save {r4, r5, r6, r7, lr} + push {r4, r5, r6, r7, lr} + .cfi_def_cfa_offset 20 + .cfi_offset lr, -4 + .cfi_offset r7, -8 + .cfi_offset r6, -12 + .cfi_offset r5, -16 + .cfi_offset r4, -20 + .setfp r7, sp, #12 + add r7, sp, #12 + .cfi_def_cfa r7, 8 + .save {r8, r9, r10, r11} + push.w {r8, r9, r10, r11} + .cfi_offset r11, -24 + .cfi_offset r10, -28 + .cfi_offset r9, -32 + .cfi_offset r8, -36 + .pad #4 + sub sp, #4 + .vsave {d8, d9, d10, d11, d12, d13, d14, d15} + vpush {d8, d9, d10, d11, d12, d13, d14, d15} + .cfi_offset d15, -48 + .cfi_offset d14, -56 + .cfi_offset d13, -64 + .cfi_offset d12, -72 + .cfi_offset d11, -80 + .cfi_offset d10, -88 + .cfi_offset d9, -96 + .cfi_offset d8, -104 + .pad #856 + sub.w sp, sp, #856 + mov r4, sp + bfc r4, #0, #4 + mov sp, r4 + mov r9, r1 + ldr r1, [r7, #8] + mov r8, r3 + mov r10, r2 + mov lr, r0 +.Ltmp230: + .loc 1 898 7 prologue_end @ ../crypto/hrss/hrss.c:898:7 + cmp 
r1, #3 + beq.w .LBB3_3 +@ %bb.1: + cmp r1, #2 + bne.w .LBB3_4 +@ %bb.2: + .loc 1 902 16 @ ../crypto/hrss/hrss.c:902:16 + vld1.16 {d20, d21}, [r10:128]! + .loc 1 903 16 @ ../crypto/hrss/hrss.c:903:16 + add r0, sp, #816 +.Ltmp231: + .loc 1 167 10 @ ../crypto/hrss/hrss.c:167:10 + vmov.i32 q8, #0x0 +.Ltmp232: + .loc 1 952 5 @ ../crypto/hrss/hrss.c:952:5 + movs r1, #30 + .loc 1 903 16 @ ../crypto/hrss/hrss.c:903:16 + vld1.64 {d22, d23}, [r10:128] +.Ltmp233: + .loc 1 167 10 @ ../crypto/hrss/hrss.c:167:10 + vmov.i32 q9, #0x0 +.Ltmp234: + .loc 1 903 16 @ ../crypto/hrss/hrss.c:903:16 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill +.Ltmp235: + .loc 1 167 10 @ ../crypto/hrss/hrss.c:167:10 + add r0, sp, #768 + vext.16 q13, q11, q8, #7 + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill + add r0, sp, #640 + vst1.64 {d26, d27}, [r0:128] @ 16-byte Spill +.Ltmp236: + .loc 1 921 5 @ ../crypto/hrss/hrss.c:921:5 + add.w r0, r8, #2 +.Ltmp237: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d16[], d17[]}, [r0:16] +.Ltmp238: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #784 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + vmul.i16 q8, q8, q13 +.Ltmp239: + .loc 1 910 46 is_stmt 1 @ ../crypto/hrss/hrss.c:910:46 + add.w r0, r8, #16 +.Ltmp240: + .loc 1 168 10 @ ../crypto/hrss/hrss.c:168:10 + vext.16 q4, q10, q11, #7 +.Ltmp241: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d24[], d25[]}, [r0:16] +.Ltmp242: + .loc 1 168 10 @ ../crypto/hrss/hrss.c:168:10 + add r0, sp, #736 +.Ltmp243: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q12, q11 +.Ltmp244: + .loc 1 168 10 @ ../crypto/hrss/hrss.c:168:10 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill +.Ltmp245: + .loc 1 922 5 @ ../crypto/hrss/hrss.c:922:5 + add.w r0, r8, #18 +.Ltmp246: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d22[], d23[]}, [r0:16] +.Ltmp247: + .loc 1 167 10 @ ../crypto/hrss/hrss.c:167:10 + add r0, sp, #832 +.Ltmp248: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q11, q4 +.Ltmp249: + .loc 1 167 10 @ ../crypto/hrss/hrss.c:167:10 + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill +.Ltmp250: + .loc 1 926 5 @ ../crypto/hrss/hrss.c:926:5 + add.w r0, r8, #4 +.Ltmp251: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d22[], d23[]}, [r0:16] +.Ltmp252: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #752 +.Ltmp253: + .loc 1 167 10 is_stmt 1 @ ../crypto/hrss/hrss.c:167:10 + vext.16 q13, q4, q13, #7 +.Ltmp254: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill + vmla.i16 q8, q11, q13 +.Ltmp255: + .loc 1 927 5 @ ../crypto/hrss/hrss.c:927:5 + add r0, sp, #672 +.Ltmp256: + .loc 1 169 10 @ ../crypto/hrss/hrss.c:169:10 + vext.16 q11, q9, q10, #7 +.Ltmp257: + .loc 1 927 5 @ ../crypto/hrss/hrss.c:927:5 + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill + add.w r0, r8, #20 +.Ltmp258: + .loc 1 168 10 @ ../crypto/hrss/hrss.c:168:10 + vext.16 q1, q11, q4, #7 +.Ltmp259: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d6[], d7[]}, [r0:16] +.Ltmp260: + .loc 1 931 5 @ ../crypto/hrss/hrss.c:931:5 + add.w r0, r8, #6 +.Ltmp261: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q3, q1 +.Ltmp262: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d20[], d21[]}, [r0:16] +.Ltmp263: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #704 +.Ltmp264: + .loc 1 167 10 is_stmt 1 @ ../crypto/hrss/hrss.c:167:10 + vext.16 q0, q1, q13, #7 +.Ltmp265: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + 
vmul.i16 q13, q3, q13 +.Ltmp266: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill + vmla.i16 q8, q10, q0 +.Ltmp267: + .loc 1 932 5 is_stmt 1 @ ../crypto/hrss/hrss.c:932:5 + add r0, sp, #624 +.Ltmp268: + .loc 1 169 10 @ ../crypto/hrss/hrss.c:169:10 + vext.16 q10, q9, q11, #7 + vorr q12, q10, q10 +.Ltmp269: + .loc 1 168 10 @ ../crypto/hrss/hrss.c:168:10 + vext.16 q15, q10, q1, #7 +.Ltmp270: + .loc 1 932 5 @ ../crypto/hrss/hrss.c:932:5 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill + add.w r0, r8, #22 +.Ltmp271: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d14[], d15[]}, [r0:16] +.Ltmp272: + .loc 1 936 5 @ ../crypto/hrss/hrss.c:936:5 + add.w r0, r8, #8 +.Ltmp273: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q7, q15 +.Ltmp274: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d20[], d21[]}, [r0:16] +.Ltmp275: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #656 +.Ltmp276: + .loc 1 167 10 is_stmt 1 @ ../crypto/hrss/hrss.c:167:10 + vext.16 q11, q15, q0, #7 +.Ltmp277: + .loc 1 169 10 @ ../crypto/hrss/hrss.c:169:10 + vext.16 q2, q9, q12, #7 +.Ltmp278: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q10, q11 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill +.Ltmp279: + .loc 1 937 5 @ ../crypto/hrss/hrss.c:937:5 + add r0, sp, #576 +.Ltmp280: + .loc 1 168 10 @ ../crypto/hrss/hrss.c:168:10 + vext.16 q10, q2, q15, #7 +.Ltmp281: + .loc 1 937 5 @ ../crypto/hrss/hrss.c:937:5 + vst1.64 {d4, d5}, [r0:128] @ 16-byte Spill + add.w r0, r8, #24 +.Ltmp282: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d24[], d25[]}, [r0:16] +.Ltmp283: + .loc 1 941 5 @ ../crypto/hrss/hrss.c:941:5 + add r0, sp, #688 +.Ltmp284: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q12, q10 +.Ltmp285: + .loc 1 167 10 @ ../crypto/hrss/hrss.c:167:10 + vext.16 q5, q10, q11, #7 +.Ltmp286: + .loc 1 169 10 @ ../crypto/hrss/hrss.c:169:10 + vext.16 q6, q9, q2, #7 +.Ltmp287: + .loc 1 941 5 @ ../crypto/hrss/hrss.c:941:5 + vst1.64 {d10, d11}, [r0:128] @ 16-byte Spill + add.w r0, r8, #10 +.Ltmp288: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d28[], d29[]}, [r0:16] +.Ltmp289: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #608 + vmla.i16 q8, q14, q5 + vst1.64 {d28, d29}, [r0:128] @ 16-byte Spill +.Ltmp290: + .loc 1 942 5 is_stmt 1 @ ../crypto/hrss/hrss.c:942:5 + add.w r0, r8, #26 +.Ltmp291: + .loc 1 168 10 @ ../crypto/hrss/hrss.c:168:10 + vext.16 q9, q6, q10, #7 +.Ltmp292: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d28[], d29[]}, [r0:16] +.Ltmp293: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #592 + vmla.i16 q8, q14, q9 + vst1.64 {d28, d29}, [r0:128] @ 16-byte Spill + add r0, sp, #720 +.Ltmp294: + .loc 1 167 10 is_stmt 1 @ ../crypto/hrss/hrss.c:167:10 + vext.16 q14, q9, q5, #7 + vst1.64 {d28, d29}, [r0:128] @ 16-byte Spill +.Ltmp295: + .loc 1 946 5 @ ../crypto/hrss/hrss.c:946:5 + add.w r0, r8, #12 +.Ltmp296: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d4[], d5[]}, [r0:16] +.Ltmp297: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #800 + vmla.i16 q8, q2, q14 + vst1.64 {d4, d5}, [r0:128] @ 16-byte Spill +.Ltmp298: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #832 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload + add r0, sp, #640 + vld1.64 {d4, d5}, [r0:128] @ 16-byte Reload +.Ltmp299: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #816 +.Ltmp300: + .loc 1 162 
10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q14, q2 +.Ltmp301: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q7, q0 +.Ltmp302: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q12, q11 +.Ltmp303: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload + add r0, sp, #736 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload +.Ltmp304: + .loc 1 159 59 is_stmt 1 @ ../crypto/hrss/hrss.c:159:59 + mov r0, r8 +.Ltmp305: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmul.i16 q0, q14, q11 +.Ltmp306: + .loc 1 159 59 @ ../crypto/hrss/hrss.c:159:59 + vld1.16 {d28[], d29[]}, [r0:16], r1 +.Ltmp307: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r1, sp, #736 + vst1.64 {d28, d29}, [r1:128] @ 16-byte Spill + add r1, sp, #768 + vld1.64 {d22, d23}, [r1:128] @ 16-byte Reload + add r1, sp, #784 + vmla.i16 q0, q14, q11 + vld1.64 {d10, d11}, [r1:128] @ 16-byte Reload + add r1, sp, #672 + vld1.64 {d22, d23}, [r1:128] @ 16-byte Reload +.Ltmp308: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r1, sp, #832 +.Ltmp309: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q5, q4 +.Ltmp310: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d28, d29}, [r1:128] @ 16-byte Reload + add r1, sp, #752 + vld1.64 {d8, d9}, [r1:128] @ 16-byte Reload + add r1, sp, #624 + vld1.64 {d4, d5}, [r1:128] @ 16-byte Reload + add r1, sp, #704 + vmla.i16 q0, q14, q11 +.Ltmp311: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q4, q1 + vld1.64 {d2, d3}, [r1:128] @ 16-byte Reload + add r1, sp, #576 +.Ltmp312: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q3, q2 +.Ltmp313: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q1, q15 + vld1.64 {d30, d31}, [r1:128] @ 16-byte Reload + add r1, sp, #656 +.Ltmp314: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q7, q15 + vld1.64 {d14, d15}, [r1:128] @ 16-byte Reload + add r1, sp, #608 + vld1.64 {d6, d7}, [r1:128] @ 16-byte Reload + add r1, sp, #592 + vld1.64 {d28, d29}, [r1:128] @ 16-byte Reload +.Ltmp315: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r1, sp, #800 +.Ltmp316: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q7, q10 +.Ltmp317: + .loc 1 169 10 is_stmt 1 @ ../crypto/hrss/hrss.c:169:10 + vmov.i32 q10, #0x0 +.Ltmp318: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q12, q6 +.Ltmp319: + .loc 1 169 10 @ ../crypto/hrss/hrss.c:169:10 + vext.16 q10, q10, q6, #7 +.Ltmp320: + .loc 1 168 10 @ ../crypto/hrss/hrss.c:168:10 + vext.16 q12, q10, q9, #7 +.Ltmp321: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q3, q9 +.Ltmp322: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r1:128] @ 16-byte Reload +.Ltmp323: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r1, sp, #816 +.Ltmp324: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q14, q10 +.Ltmp325: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q9, q12 +.Ltmp326: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmul.i16 q9, q5, q11 + vld1.64 {d22, d23}, [r1:128] @ 16-byte Reload + add r1, sp, #736 + vld1.64 {d10, d11}, [r1:128] @ 16-byte Reload +.Ltmp327: + .loc 1 947 5 is_stmt 1 @ ../crypto/hrss/hrss.c:947:5 + add.w r1, r8, #28 +.Ltmp328: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q9, q5, q11 +.Ltmp329: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d22[], d23[]}, [r1:16] +.Ltmp330: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r1, sp, 
#688 +.Ltmp331: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q11, q12 +.Ltmp332: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q9, q4, q2 +.Ltmp333: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q9, q1, q15 +.Ltmp334: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d30, d31}, [r1:128] @ 16-byte Reload +.Ltmp335: + .loc 1 951 5 is_stmt 1 @ ../crypto/hrss/hrss.c:951:5 + add.w r1, r8, #14 +.Ltmp336: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q14, q15 +.Ltmp337: + .loc 1 169 10 @ ../crypto/hrss/hrss.c:169:10 + vmov.i32 q14, #0x0 +.Ltmp338: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q9, q7, q6 +.Ltmp339: + .loc 1 167 10 @ ../crypto/hrss/hrss.c:167:10 + vmov.i32 q1, #0x0 +.Ltmp340: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q9, q3, q10 +.Ltmp341: + .loc 1 169 10 @ ../crypto/hrss/hrss.c:169:10 + vext.16 q10, q14, q10, #7 +.Ltmp342: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d28[], d29[]}, [r1:16] + add r1, sp, #720 +.Ltmp343: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q11, q10 + vld1.64 {d6, d7}, [r1:128] @ 16-byte Reload +.Ltmp344: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r1, sp, #800 +.Ltmp345: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q11, q3 +.Ltmp346: + .loc 1 167 10 is_stmt 1 @ ../crypto/hrss/hrss.c:167:10 + vext.16 q15, q12, q3, #7 + .loc 1 168 10 @ ../crypto/hrss/hrss.c:168:10 + vext.16 q11, q10, q12, #7 +.Ltmp347: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q14, q15 +.Ltmp348: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d24, d25}, [r1:128] @ 16-byte Reload +.Ltmp349: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q14, q11 +.Ltmp350: + .loc 1 956 5 is_stmt 1 @ ../crypto/hrss/hrss.c:956:5 + mov r1, lr +.Ltmp351: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q9, q12, q10 +.Ltmp352: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d24[], d25[]}, [r0:16] +.Ltmp353: + .loc 1 956 5 is_stmt 1 @ ../crypto/hrss/hrss.c:956:5 + movs r0, #48 +.Ltmp354: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q12, q15 +.Ltmp355: + .loc 1 169 10 @ ../crypto/hrss/hrss.c:169:10 + vext.16 q10, q1, q10, #7 +.Ltmp356: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q12, q11 +.Ltmp357: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q12, q10 +.Ltmp358: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q9, q14, q10 +.Ltmp359: + .loc 1 956 5 is_stmt 1 @ ../crypto/hrss/hrss.c:956:5 + vst1.16 {d18, d19}, [r1:128], r0 + add.w r0, lr, #32 + vst1.64 {d26, d27}, [r1:128] + vst1.64 {d16, d17}, [r0:128] + add.w r0, lr, #16 + vst1.64 {d0, d1}, [r0:128] + b.w .LBB3_17 +.LBB3_3: + .loc 1 0 5 is_stmt 0 @ ../crypto/hrss/hrss.c:0:5 + movs r0, #32 + .loc 1 965 16 is_stmt 1 @ ../crypto/hrss/hrss.c:965:16 + add.w r1, r10, #16 + .loc 1 964 16 @ ../crypto/hrss/hrss.c:964:16 + vld1.16 {d22, d23}, [r10:128], r0 +.Ltmp360: + .loc 1 174 10 @ ../crypto/hrss/hrss.c:174:10 + vmov.i32 q8, #0x0 +.Ltmp361: + .loc 1 966 16 @ ../crypto/hrss/hrss.c:966:16 + add r0, sp, #752 +.Ltmp362: + .loc 1 174 10 @ ../crypto/hrss/hrss.c:174:10 + vmov.i32 q10, #0x0 +.Ltmp363: + .loc 1 965 16 @ ../crypto/hrss/hrss.c:965:16 + vld1.64 {d18, d19}, [r1:128] +.Ltmp364: + .loc 1 159 59 @ ../crypto/hrss/hrss.c:159:59 + mov r1, r8 + vorr q14, q9, q9 +.Ltmp365: + .loc 1 966 16 @ ../crypto/hrss/hrss.c:966:16 + vld1.64 {d6, d7}, [r10:128] +.Ltmp366: + 
.loc 1 174 10 @ ../crypto/hrss/hrss.c:174:10 + vext.16 q1, q3, q8, #7 + .loc 1 175 10 @ ../crypto/hrss/hrss.c:175:10 + vext.16 q15, q9, q3, #7 +.Ltmp367: + .loc 1 966 16 @ ../crypto/hrss/hrss.c:966:16 + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill + add r0, sp, #768 +.Ltmp368: + .loc 1 174 10 @ ../crypto/hrss/hrss.c:174:10 + vext.16 q8, q15, q1, #7 + vst1.64 {d30, d31}, [r0:128] @ 16-byte Spill +.Ltmp369: + .loc 1 1000 5 @ ../crypto/hrss/hrss.c:1000:5 + add r0, sp, #624 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add.w r0, r8, #36 +.Ltmp370: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d24[], d25[]}, [r0:16] +.Ltmp371: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #832 + vmul.i16 q2, q12, q8 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill +.Ltmp372: + .loc 1 994 5 is_stmt 1 @ ../crypto/hrss/hrss.c:994:5 + add.w r0, r8, #34 +.Ltmp373: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d24[], d25[]}, [r0:16] +.Ltmp374: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #256 + vmla.i16 q2, q12, q1 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill + add r0, sp, #64 +.Ltmp375: + .loc 1 176 10 is_stmt 1 @ ../crypto/hrss/hrss.c:176:10 + vext.16 q13, q11, q9, #7 + vst1.64 {d2, d3}, [r0:128] @ 16-byte Spill +.Ltmp376: + .loc 1 175 10 @ ../crypto/hrss/hrss.c:175:10 + add r0, sp, #704 + vext.16 q6, q13, q15, #7 + vorr q15, q13, q13 + vst1.64 {d28, d29}, [r0:128] @ 16-byte Spill +.Ltmp377: + .loc 1 1006 5 @ ../crypto/hrss/hrss.c:1006:5 + add.w r0, r8, #38 +.Ltmp378: + .loc 1 177 10 @ ../crypto/hrss/hrss.c:177:10 + vext.16 q12, q10, q11, #7 +.Ltmp379: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d18[], d19[]}, [r0:16] +.Ltmp380: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #320 +.Ltmp381: + .loc 1 174 10 is_stmt 1 @ ../crypto/hrss/hrss.c:174:10 + vext.16 q8, q6, q8, #7 +.Ltmp382: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + vmla.i16 q2, q9, q8 +.Ltmp383: + .loc 1 176 10 @ ../crypto/hrss/hrss.c:176:10 + add r0, sp, #160 + vext.16 q9, q12, q13, #7 + vorr q7, q8, q8 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill +.Ltmp384: + .loc 1 175 10 @ ../crypto/hrss/hrss.c:175:10 + add r0, sp, #672 + vorr q4, q9, q9 + vext.16 q13, q9, q6, #7 + vst1.64 {d30, d31}, [r0:128] @ 16-byte Spill + add r0, sp, #816 + vst1.64 {d26, d27}, [r0:128] @ 16-byte Spill +.Ltmp385: + .loc 1 174 10 @ ../crypto/hrss/hrss.c:174:10 + add r0, sp, #416 + vst1.64 {d12, d13}, [r0:128] @ 16-byte Spill +.Ltmp386: + .loc 1 1012 5 @ ../crypto/hrss/hrss.c:1012:5 + add r0, sp, #48 + vst1.64 {d14, d15}, [r0:128] @ 16-byte Spill + add.w r0, r8, #40 +.Ltmp387: + .loc 1 174 10 @ ../crypto/hrss/hrss.c:174:10 + vext.16 q0, q13, q8, #7 +.Ltmp388: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d22[], d23[]}, [r0:16] + add r0, sp, #352 + vorr q8, q0, q0 +.Ltmp389: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q2, q11, q0 + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill + add r0, sp, #544 +.Ltmp390: + .loc 1 177 10 is_stmt 1 @ ../crypto/hrss/hrss.c:177:10 + vext.16 q11, q10, q12, #7 +.Ltmp391: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill +.Ltmp392: + .loc 1 176 10 @ ../crypto/hrss/hrss.c:176:10 + add r0, sp, #736 + vext.16 q12, q11, q9, #7 + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill +.Ltmp393: + .loc 1 175 10 @ ../crypto/hrss/hrss.c:175:10 + add r0, sp, #640 + vorr q5, q12, q12 + vext.16 q0, q12, q13, #7 + vst1.64 {d8, d9}, 
[r0:128] @ 16-byte Spill +.Ltmp394: + .loc 1 174 10 @ ../crypto/hrss/hrss.c:174:10 + add r0, sp, #272 +.Ltmp395: + .loc 1 177 10 @ ../crypto/hrss/hrss.c:177:10 + vext.16 q9, q10, q11, #7 +.Ltmp396: + .loc 1 174 10 @ ../crypto/hrss/hrss.c:174:10 + vst1.64 {d0, d1}, [r0:128] @ 16-byte Spill +.Ltmp397: + .loc 1 1018 5 @ ../crypto/hrss/hrss.c:1018:5 + add.w r0, r8, #42 +.Ltmp398: + .loc 1 174 10 @ ../crypto/hrss/hrss.c:174:10 + vext.16 q13, q0, q8, #7 +.Ltmp399: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d16[], d17[]}, [r0:16] + add r0, sp, #384 +.Ltmp400: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q2, q8, q13 +.Ltmp401: + .loc 1 176 10 is_stmt 1 @ ../crypto/hrss/hrss.c:176:10 + vext.16 q11, q9, q12, #7 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill +.Ltmp402: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #496 +.Ltmp403: + .loc 1 175 10 @ ../crypto/hrss/hrss.c:175:10 + vext.16 q12, q11, q0, #7 +.Ltmp404: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vst1.64 {d26, d27}, [r0:128] @ 16-byte Spill +.Ltmp405: + .loc 1 176 10 @ ../crypto/hrss/hrss.c:176:10 + add r0, sp, #720 + vorr q0, q12, q12 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill +.Ltmp406: + .loc 1 175 10 @ ../crypto/hrss/hrss.c:175:10 + add r0, sp, #656 +.Ltmp407: + .loc 1 174 10 @ ../crypto/hrss/hrss.c:174:10 + vext.16 q12, q12, q13, #7 +.Ltmp408: + .loc 1 175 10 @ ../crypto/hrss/hrss.c:175:10 + vst1.64 {d10, d11}, [r0:128] @ 16-byte Spill +.Ltmp409: + .loc 1 1024 5 @ ../crypto/hrss/hrss.c:1024:5 + add r0, sp, #464 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill + add.w r0, r8, #44 +.Ltmp410: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d16[], d17[]}, [r0:16] + add r0, sp, #400 +.Ltmp411: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q2, q8, q12 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill +.Ltmp412: + .loc 1 176 10 is_stmt 1 @ ../crypto/hrss/hrss.c:176:10 + add r0, sp, #784 +.Ltmp413: + .loc 1 177 10 @ ../crypto/hrss/hrss.c:177:10 + vext.16 q8, q10, q9, #7 +.Ltmp414: + .loc 1 176 10 @ ../crypto/hrss/hrss.c:176:10 + vext.16 q9, q8, q11, #7 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #800 + vorr q8, q11, q11 + vst1.64 {d18, d19}, [r0:128] @ 16-byte Spill + add r0, sp, #592 +.Ltmp415: + .loc 1 175 10 @ ../crypto/hrss/hrss.c:175:10 + vext.16 q10, q9, q0, #7 + vorr q9, q0, q0 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #304 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill +.Ltmp416: + .loc 1 1030 5 @ ../crypto/hrss/hrss.c:1030:5 + movs r0, #46 +.Ltmp417: + .loc 1 159 59 @ ../crypto/hrss/hrss.c:159:59 + vld1.16 {d22[], d23[]}, [r1:16], r0 +.Ltmp418: + .loc 1 174 10 @ ../crypto/hrss/hrss.c:174:10 + add r0, sp, #512 + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill + add r0, sp, #368 + vext.16 q11, q10, q12, #7 +.Ltmp419: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d20[], d21[]}, [r1:16] +.Ltmp420: + .loc 1 1035 5 @ ../crypto/hrss/hrss.c:1035:5 + mov r1, lr +.Ltmp421: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q2, q10, q11 + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill + add r0, sp, #176 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill + add r0, sp, #144 + vst1.64 {d4, d5}, [r0:128] @ 16-byte Spill +.Ltmp422: + .loc 1 980 5 @ ../crypto/hrss/hrss.c:980:5 + add.w r0, r8, #32 +.Ltmp423: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d20[], d21[]}, [r0:16] +.Ltmp424: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #480 + vmul.i16 q13, q10, q14 + vst1.64 {d20, d21}, 
[r0:128] @ 16-byte Spill +.Ltmp425: + .loc 1 979 5 is_stmt 1 @ ../crypto/hrss/hrss.c:979:5 + add.w r0, r8, #16 + vorr q14, q3, q3 +.Ltmp426: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d20[], d21[]}, [r0:16] +.Ltmp427: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #688 + vmla.i16 q13, q10, q3 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill +.Ltmp428: + .loc 1 992 5 is_stmt 1 @ ../crypto/hrss/hrss.c:992:5 + add.w r0, r8, #2 +.Ltmp429: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d20[], d21[]}, [r0:16] +.Ltmp430: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #128 + vmla.i16 q13, q10, q1 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill +.Ltmp431: + .loc 1 993 5 is_stmt 1 @ ../crypto/hrss/hrss.c:993:5 + add.w r0, r8, #18 +.Ltmp432: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d20[], d21[]}, [r0:16] + add r0, sp, #288 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill +.Ltmp433: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #768 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload + add r0, sp, #256 + vmla.i16 q13, q10, q11 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp434: + .loc 1 998 5 is_stmt 1 @ ../crypto/hrss/hrss.c:998:5 + add.w r0, r8, #4 +.Ltmp435: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d24[], d25[]}, [r0:16] +.Ltmp436: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #432 +.Ltmp437: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q11, q15 +.Ltmp438: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill + add r0, sp, #624 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp439: + .loc 1 999 5 is_stmt 1 @ ../crypto/hrss/hrss.c:999:5 + add.w r0, r8, #20 +.Ltmp440: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q12, q10 +.Ltmp441: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d20[], d21[]}, [r0:16] +.Ltmp442: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #832 + vorr q15, q10, q10 +.Ltmp443: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q10, q6 +.Ltmp444: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp445: + .loc 1 1004 5 is_stmt 1 @ ../crypto/hrss/hrss.c:1004:5 + add.w r0, r8, #6 + vorr q6, q14, q14 +.Ltmp446: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q10, q4 +.Ltmp447: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d20[], d21[]}, [r0:16] +.Ltmp448: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #608 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill +.Ltmp449: + .loc 1 1005 5 is_stmt 1 @ ../crypto/hrss/hrss.c:1005:5 + add.w r0, r8, #22 +.Ltmp450: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q10, q7 +.Ltmp451: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d20[], d21[]}, [r0:16] +.Ltmp452: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #240 + vorr q7, q9, q9 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill + add r0, sp, #816 + vld1.64 {d24, d25}, [r0:128] @ 16-byte Reload + add r0, sp, #320 + vmla.i16 q13, q10, q12 + vld1.64 {d4, d5}, [r0:128] @ 16-byte Reload +.Ltmp453: + .loc 1 1010 5 is_stmt 1 @ ../crypto/hrss/hrss.c:1010:5 + add.w r0, r8, #8 +.Ltmp454: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d24[], d25[]}, [r0:16] +.Ltmp455: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #96 +.Ltmp456: + .loc 1 162 10 @ 
../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q2, q5 +.Ltmp457: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill + add r0, sp, #544 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp458: + .loc 1 1011 5 is_stmt 1 @ ../crypto/hrss/hrss.c:1011:5 + add.w r0, r8, #24 +.Ltmp459: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q12, q10 +.Ltmp460: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d20[], d21[]}, [r0:16] + add r0, sp, #224 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill + add r0, sp, #272 + vld1.64 {d10, d11}, [r0:128] @ 16-byte Reload + add r0, sp, #352 +.Ltmp461: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q10, q5 + vld1.64 {d0, d1}, [r0:128] @ 16-byte Reload +.Ltmp462: + .loc 1 1016 5 is_stmt 1 @ ../crypto/hrss/hrss.c:1016:5 + add.w r0, r8, #10 +.Ltmp463: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q0, q8 +.Ltmp464: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d16[], d17[]}, [r0:16] +.Ltmp465: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #576 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #496 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp466: + .loc 1 1017 5 is_stmt 1 @ ../crypto/hrss/hrss.c:1017:5 + add.w r0, r8, #26 +.Ltmp467: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q8, q10 +.Ltmp468: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d16[], d17[]}, [r0:16] + add r0, sp, #208 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill +.Ltmp469: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #16 + vmla.i16 q13, q8, q9 + vst1.64 {d14, d15}, [r0:128] @ 16-byte Spill + add r0, sp, #800 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #384 + vld1.64 {d8, d9}, [r0:128] @ 16-byte Reload +.Ltmp470: + .loc 1 1022 5 is_stmt 1 @ ../crypto/hrss/hrss.c:1022:5 + add.w r0, r8, #12 +.Ltmp471: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q4, q9 +.Ltmp472: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d16[], d17[]}, [r0:16] +.Ltmp473: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #528 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #464 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp474: + .loc 1 1023 5 is_stmt 1 @ ../crypto/hrss/hrss.c:1023:5 + add.w r0, r8, #28 +.Ltmp475: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q8, q10 +.Ltmp476: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d16[], d17[]}, [r0:16] + add r0, sp, #192 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #304 + vld1.64 {d6, d7}, [r0:128] @ 16-byte Reload +.Ltmp477: + .loc 1 177 10 is_stmt 1 @ ../crypto/hrss/hrss.c:177:10 + add r0, sp, #784 +.Ltmp478: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q8, q3 +.Ltmp479: + .loc 1 177 10 @ ../crypto/hrss/hrss.c:177:10 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp480: + .loc 1 176 10 @ ../crypto/hrss/hrss.c:176:10 + add r0, sp, #336 +.Ltmp481: + .loc 1 177 10 @ ../crypto/hrss/hrss.c:177:10 + vmov.i32 q8, #0x0 + vext.16 q8, q8, q10, #7 +.Ltmp482: + .loc 1 176 10 @ ../crypto/hrss/hrss.c:176:10 + vext.16 q10, q8, q9, #7 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill + add r0, sp, #448 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill + add r0, sp, #400 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload +.Ltmp483: + .loc 1 1028 5 @ ../crypto/hrss/hrss.c:1028:5 + add.w r0, r8, #14 +.Ltmp484: + .loc 1 162 10 @ 
../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q9, q10 +.Ltmp485: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d16[], d17[]}, [r0:16] + add r0, sp, #80 + vst1.64 {d16, d17}, [r0:128] @ 16-byte Spill +.Ltmp486: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #368 +.Ltmp487: + .loc 1 175 10 is_stmt 1 @ ../crypto/hrss/hrss.c:175:10 + vext.16 q10, q10, q3, #7 +.Ltmp488: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d24, d25}, [r0:128] @ 16-byte Reload + add r0, sp, #560 + vmla.i16 q13, q8, q12 + vst1.64 {d20, d21}, [r0:128] @ 16-byte Spill +.Ltmp489: + .loc 1 1029 5 @ ../crypto/hrss/hrss.c:1029:5 + add.w r0, r8, #30 +.Ltmp490: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.16 {d24[], d25[]}, [r0:16] + add r0, sp, #112 +.Ltmp491: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q12, q10 + vst1.64 {d24, d25}, [r0:128] @ 16-byte Spill + add r0, sp, #288 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp492: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #64 + vld1.64 {d16, d17}, [r0:128] @ 16-byte Reload + add r0, sp, #480 + vmul.i16 q8, q10, q8 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload +.Ltmp493: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #768 +.Ltmp494: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q14, q6 +.Ltmp495: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload +.Ltmp496: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #32 +.Ltmp497: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q11, q14 + vorr q11, q15, q15 +.Ltmp498: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vst1.64 {d22, d23}, [r0:128] @ 16-byte Spill + add r0, sp, #624 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload +.Ltmp499: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #416 +.Ltmp500: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q15, q14 +.Ltmp501: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload + add r0, sp, #832 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload + add r0, sp, #240 + vmla.i16 q8, q15, q14 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload +.Ltmp502: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #48 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload +.Ltmp503: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #816 +.Ltmp504: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q14, q15 +.Ltmp505: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload + add r0, sp, #224 + vmla.i16 q8, q2, q15 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload +.Ltmp506: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #544 + vld1.64 {d4, d5}, [r0:128] @ 16-byte Reload + add r0, sp, #208 + vmla.i16 q8, q15, q2 +.Ltmp507: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q0, q5 + vld1.64 {d0, d1}, [r0:128] @ 16-byte Reload +.Ltmp508: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #496 + vld1.64 {d4, d5}, [r0:128] @ 16-byte Reload + add r0, sp, #192 + vld1.64 {d2, d3}, [r0:128] @ 16-byte Reload +.Ltmp509: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #464 +.Ltmp510: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q0, q2 +.Ltmp511: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d4, d5}, [r0:128] @ 16-byte Reload +.Ltmp512: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #368 +.Ltmp513: + .loc 1 
162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q4, q7 +.Ltmp514: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q1, q2 +.Ltmp515: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q9, q3 +.Ltmp516: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload +.Ltmp517: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #704 +.Ltmp518: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q8, q12, q9 +.Ltmp519: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #688 + vld1.64 {d24, d25}, [r0:128] @ 16-byte Reload + add r0, sp, #512 + vmul.i16 q12, q12, q9 + vld1.64 {d10, d11}, [r0:128] @ 16-byte Reload + add r0, sp, #752 + vld1.64 {d6, d7}, [r0:128] @ 16-byte Reload +.Ltmp520: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #480 +.Ltmp521: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q5, q6 +.Ltmp522: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #128 + vld1.64 {d8, d9}, [r0:128] @ 16-byte Reload +.Ltmp523: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #768 +.Ltmp524: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q9, q3 +.Ltmp525: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload +.Ltmp526: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #672 +.Ltmp527: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q4, q9 +.Ltmp528: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #160 + vld1.64 {d12, d13}, [r0:128] @ 16-byte Reload +.Ltmp529: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #256 +.Ltmp530: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q10, q9 +.Ltmp531: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #432 + vld1.64 {d4, d5}, [r0:128] @ 16-byte Reload +.Ltmp532: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #416 +.Ltmp533: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q9, q6 +.Ltmp534: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload +.Ltmp535: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #640 +.Ltmp536: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q2, q9 +.Ltmp537: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #736 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp538: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #832 +.Ltmp539: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q11, q9 +.Ltmp540: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload +.Ltmp541: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #608 +.Ltmp542: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q9, q10 +.Ltmp543: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #816 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp544: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #656 +.Ltmp545: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q9, q11 +.Ltmp546: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #720 + vmla.i16 
q12, q14, q9 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload +.Ltmp547: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #320 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #96 + vld1.64 {d14, d15}, [r0:128] @ 16-byte Reload +.Ltmp548: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #272 +.Ltmp549: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q9, q14 +.Ltmp550: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload +.Ltmp551: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #592 +.Ltmp552: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q7, q9 +.Ltmp553: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload +.Ltmp554: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #784 +.Ltmp555: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q15, q9 +.Ltmp556: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #352 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp557: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #576 +.Ltmp558: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q11, q9 +.Ltmp559: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #16 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp560: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #800 +.Ltmp561: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q9, q11 +.Ltmp562: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #336 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp563: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #384 +.Ltmp564: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q0, q9 +.Ltmp565: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload +.Ltmp566: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #528 +.Ltmp567: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q9, q11 +.Ltmp568: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #304 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload +.Ltmp569: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #448 +.Ltmp570: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q9, q15 +.Ltmp571: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload +.Ltmp572: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #400 +.Ltmp573: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q1, q9 +.Ltmp574: + .loc 1 174 10 is_stmt 1 @ ../crypto/hrss/hrss.c:174:10 + vmov.i32 q9, #0x0 +.Ltmp575: + .loc 1 177 10 @ ../crypto/hrss/hrss.c:177:10 + vext.16 q1, q9, q11, #7 +.Ltmp576: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add r0, sp, #80 + vmla.i16 q12, q9, q1 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload +.Ltmp577: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #560 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp578: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #688 +.Ltmp579: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q9, q11 +.Ltmp580: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d22, d23}, [r0:128] @ 16-byte 
Reload + add r0, sp, #704 + vmul.i16 q0, q11, q3 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp581: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #672 + vorr q3, q4, q4 +.Ltmp582: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q5, q11 +.Ltmp583: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp584: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #288 + vorr q5, q7, q7 +.Ltmp585: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q4, q11 +.Ltmp586: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp587: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #640 +.Ltmp588: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q11, q6 +.Ltmp589: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp590: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #32 +.Ltmp591: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q2, q11 +.Ltmp592: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload + add r0, sp, #608 + vld1.64 {d4, d5}, [r0:128] @ 16-byte Reload +.Ltmp593: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #656 +.Ltmp594: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q11, q10 +.Ltmp595: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp596: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #240 +.Ltmp597: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q2, q10 +.Ltmp598: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp599: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #592 +.Ltmp600: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q10, q14 +.Ltmp601: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload + add r0, sp, #784 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload +.Ltmp602: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #224 +.Ltmp603: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q7, q10 +.Ltmp604: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload + add r0, sp, #576 + vld1.64 {d14, d15}, [r0:128] @ 16-byte Reload +.Ltmp605: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #800 +.Ltmp606: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q10, q15 +.Ltmp607: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload + add r0, sp, #336 + vld1.64 {d22, d23}, [r0:128] @ 16-byte Reload +.Ltmp608: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #208 +.Ltmp609: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q7, q10 +.Ltmp610: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload + add r0, sp, #528 + vld1.64 {d8, d9}, [r0:128] @ 16-byte Reload + add r0, sp, #448 + vmla.i16 q0, q10, q11 + vld1.64 {d20, d21}, [r0:128] @ 16-byte Reload +.Ltmp611: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #192 + vld1.64 {d28, d29}, [r0:128] @ 16-byte Reload +.Ltmp612: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #752 +.Ltmp613: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q4, q10 +.Ltmp614: + .loc 1 176 10 is_stmt 1 @ ../crypto/hrss/hrss.c:176:10 + 
vext.16 q10, q1, q10, #7 +.Ltmp615: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q14, q1 +.Ltmp616: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vmul.i16 q14, q3, q6 + vld1.64 {d12, d13}, [r0:128] @ 16-byte Reload + add r0, sp, #512 + vld1.64 {d6, d7}, [r0:128] @ 16-byte Reload +.Ltmp617: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #432 +.Ltmp618: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q9, q10 +.Ltmp619: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q14, q3, q6 +.Ltmp620: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d6, d7}, [r0:128] @ 16-byte Reload + add r0, sp, #736 + vld1.64 {d12, d13}, [r0:128] @ 16-byte Reload +.Ltmp621: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #720 +.Ltmp622: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q14, q3, q6 +.Ltmp623: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d6, d7}, [r0:128] @ 16-byte Reload + add r0, sp, #176 + vmla.i16 q14, q2, q3 +.Ltmp624: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q14, q5, q15 + vld1.64 {d30, d31}, [r0:128] @ 16-byte Reload +.Ltmp625: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + add r0, sp, #560 +.Ltmp626: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q13, q15, q10 +.Ltmp627: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q14, q7, q11 +.Ltmp628: + .loc 1 177 10 is_stmt 1 @ ../crypto/hrss/hrss.c:177:10 + vmov.i32 q11, #0x0 +.Ltmp629: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q14, q4, q1 +.Ltmp630: + .loc 1 177 10 @ ../crypto/hrss/hrss.c:177:10 + vext.16 q11, q11, q1, #7 +.Ltmp631: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vld1.64 {d2, d3}, [r0:128] @ 16-byte Reload + add r0, sp, #112 + vmla.i16 q8, q15, q1 +.Ltmp632: + .loc 1 162 10 is_stmt 0 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q14, q9, q11 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload +.Ltmp633: + .loc 1 1035 5 is_stmt 1 @ ../crypto/hrss/hrss.c:1035:5 + movs r0, #80 +.Ltmp634: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q9, q10 +.Ltmp635: + .loc 1 1035 5 @ ../crypto/hrss/hrss.c:1035:5 + vst1.16 {d28, d29}, [r1:128], r0 + add r0, sp, #144 +.Ltmp636: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q0, q9, q11 +.Ltmp637: + .loc 1 1035 5 @ ../crypto/hrss/hrss.c:1035:5 + vld1.64 {d18, d19}, [r0:128] @ 16-byte Reload + add.w r0, lr, #64 + vst1.64 {d18, d19}, [r1:128] +.Ltmp638: + .loc 1 162 10 @ ../crypto/hrss/hrss.c:162:10 + vmla.i16 q12, q15, q11 +.Ltmp639: + .loc 1 1035 5 @ ../crypto/hrss/hrss.c:1035:5 + vst1.64 {d16, d17}, [r0:128] + add.w r0, lr, #16 + vst1.64 {d0, d1}, [r0:128] + add.w r0, lr, #48 + vst1.64 {d26, d27}, [r0:128] + add.w r0, lr, #32 + vst1.64 {d24, d25}, [r0:128] + b .LBB3_17 +.LBB3_4: + .loc 1 1045 28 @ ../crypto/hrss/hrss.c:1045:28 + lsr.w r11, r1, #1 + .loc 1 1048 26 @ ../crypto/hrss/hrss.c:1048:26 + add.w r0, r8, r11, lsl #4 + .loc 1 1047 26 @ ../crypto/hrss/hrss.c:1047:26 + str r0, [sp, #816] @ 4-byte Spill + add.w r0, r10, r11, lsl #4 + .loc 1 1046 29 @ ../crypto/hrss/hrss.c:1046:29 + str r0, [sp, #800] @ 4-byte Spill + sub.w r0, r1, r1, lsr #1 + str r0, [sp, #832] @ 4-byte Spill + movs r0, #0 + .loc 1 1052 3 @ ../crypto/hrss/hrss.c:1052:3 + cmp.w r0, r1, lsr #1 + beq .LBB3_7 +@ %bb.5: + .loc 1 1053 22 @ ../crypto/hrss/hrss.c:1053:22 + ldr r0, [r7, #8] + lsl.w r1, r11, #4 + mov r2, r11 + mov r3, r8 + mov r4, lr + mov r5, r10 + lsls r0, r0, #4 + sub.w r12, r0, r11, lsl #4 +.LBB3_6: @ =>This Inner Loop Header: Depth=1 + 
adds r0, r5, r1 + .loc 1 1053 33 is_stmt 0 @ ../crypto/hrss/hrss.c:1053:33 + vld1.16 {d16, d17}, [r5:128]! + .loc 1 1054 33 is_stmt 1 @ ../crypto/hrss/hrss.c:1054:33 + adds r6, r3, r1 + .loc 1 1052 24 @ ../crypto/hrss/hrss.c:1052:24 + subs r2, #1 + .loc 1 1053 22 @ ../crypto/hrss/hrss.c:1053:22 + vld1.64 {d18, d19}, [r0:128] + .loc 1 1054 23 @ ../crypto/hrss/hrss.c:1054:23 + add.w r0, r4, r12 +.Ltmp640: + .loc 1 155 58 @ ../crypto/hrss/hrss.c:155:58 + vadd.i16 q8, q8, q9 +.Ltmp641: + .loc 1 1053 12 @ ../crypto/hrss/hrss.c:1053:12 + vst1.16 {d16, d17}, [r4:128]! + .loc 1 1054 44 @ ../crypto/hrss/hrss.c:1054:44 + vld1.16 {d18, d19}, [r3:128]! + .loc 1 1054 33 is_stmt 0 @ ../crypto/hrss/hrss.c:1054:33 + vld1.64 {d16, d17}, [r6:128] +.Ltmp642: + .loc 1 155 58 is_stmt 1 @ ../crypto/hrss/hrss.c:155:58 + vadd.i16 q8, q9, q8 +.Ltmp643: + .loc 1 1054 23 @ ../crypto/hrss/hrss.c:1054:23 + vst1.64 {d16, d17}, [r0:128] + .loc 1 1052 3 @ ../crypto/hrss/hrss.c:1052:3 + bne .LBB3_6 +.LBB3_7: + .loc 1 1056 7 @ ../crypto/hrss/hrss.c:1056:7 + ldr r0, [sp, #832] @ 4-byte Reload + ldr r4, [r7, #8] + cmp r0, r11 + beq .LBB3_9 +@ %bb.8: + .loc 1 1057 20 @ ../crypto/hrss/hrss.c:1057:20 + ldr r0, [sp, #800] @ 4-byte Reload + add.w r0, r0, r11, lsl #4 + vld1.64 {d16, d17}, [r0:128] + .loc 1 1057 5 is_stmt 0 @ ../crypto/hrss/hrss.c:1057:5 + add.w r0, lr, r11, lsl #4 + .loc 1 1057 18 @ ../crypto/hrss/hrss.c:1057:18 + vst1.64 {d16, d17}, [r0:128] + .loc 1 1058 31 is_stmt 1 @ ../crypto/hrss/hrss.c:1058:31 + ldr r0, [sp, #816] @ 4-byte Reload + add.w r0, r0, r11, lsl #4 + vld1.64 {d16, d17}, [r0:128] + .loc 1 1058 5 is_stmt 0 @ ../crypto/hrss/hrss.c:1058:5 + add.w r0, lr, r4, lsl #4 + .loc 1 1058 29 @ ../crypto/hrss/hrss.c:1058:29 + vst1.64 {d16, d17}, [r0:128] +.LBB3_9: + .loc 1 0 29 @ ../crypto/hrss/hrss.c:0:29 + ldr r4, [sp, #832] @ 4-byte Reload + mov r6, r11 + .loc 1 1063 3 is_stmt 1 @ ../crypto/hrss/hrss.c:1063:3 + mov r0, r9 + mov r2, lr + str r4, [sp] + mov r5, lr + .loc 1 1061 33 @ ../crypto/hrss/hrss.c:1061:33 + add.w r11, r9, r4, lsl #5 + .loc 1 1063 50 @ ../crypto/hrss/hrss.c:1063:50 + add.w r3, lr, r4, lsl #4 + .loc 1 1063 3 is_stmt 0 @ ../crypto/hrss/hrss.c:1063:3 + mov r1, r11 + bl poly_mul_vec_aux + .loc 1 1065 33 is_stmt 1 @ ../crypto/hrss/hrss.c:1065:33 + ldr r0, [r7, #8] + .loc 1 1065 3 is_stmt 0 @ ../crypto/hrss/hrss.c:1065:3 + mov r1, r11 + str r4, [sp] + .loc 1 1065 33 @ ../crypto/hrss/hrss.c:1065:33 + bic r4, r0, #1 + .loc 1 1065 3 @ ../crypto/hrss/hrss.c:1065:3 + ldr r2, [sp, #800] @ 4-byte Reload + ldr r3, [sp, #816] @ 4-byte Reload + .loc 1 1065 21 @ ../crypto/hrss/hrss.c:1065:21 + add.w r0, r5, r4, lsl #4 + .loc 1 1065 3 @ ../crypto/hrss/hrss.c:1065:3 + bl poly_mul_vec_aux + .loc 1 1067 3 is_stmt 1 @ ../crypto/hrss/hrss.c:1067:3 + mov r1, r11 + mov r0, r5 + mov r2, r10 + mov r3, r8 + str r6, [sp] + mov r11, r6 + bl poly_mul_vec_aux + .loc 1 1070 3 @ ../crypto/hrss/hrss.c:1070:3 + cbz r4, .LBB3_12 +@ %bb.10: + .loc 1 1071 26 @ ../crypto/hrss/hrss.c:1071:26 + lsl.w r0, r11, #5 + mov r1, r4 + mov r2, r5 + mov r3, r9 +.LBB3_11: @ =>This Inner Loop Header: Depth=1 + .loc 1 1071 54 is_stmt 0 @ ../crypto/hrss/hrss.c:1071:54 + adds r6, r2, r0 + .loc 1 1071 46 @ ../crypto/hrss/hrss.c:1071:46 + vld1.16 {d16, d17}, [r2:128]! 
+ .loc 1 1070 24 is_stmt 1 @ ../crypto/hrss/hrss.c:1070:24 + subs r1, #1 + .loc 1 1071 26 @ ../crypto/hrss/hrss.c:1071:26 + vld1.64 {d18, d19}, [r3:128] +.Ltmp644: + .loc 1 155 58 @ ../crypto/hrss/hrss.c:155:58 + vsub.i16 q8, q9, q8 +.Ltmp645: + .loc 1 1071 54 @ ../crypto/hrss/hrss.c:1071:54 + vld1.64 {d20, d21}, [r6:128] +.Ltmp646: + .loc 1 157 58 @ ../crypto/hrss/hrss.c:157:58 + vsub.i16 q8, q8, q10 +.Ltmp647: + .loc 1 1071 16 @ ../crypto/hrss/hrss.c:1071:16 + vst1.16 {d16, d17}, [r3:128]! + .loc 1 1070 3 @ ../crypto/hrss/hrss.c:1070:3 + bne .LBB3_11 +.LBB3_12: + .loc 1 0 3 is_stmt 0 @ ../crypto/hrss/hrss.c:0:3 + ldr r1, [sp, #832] @ 4-byte Reload + .loc 1 1073 7 is_stmt 1 @ ../crypto/hrss/hrss.c:1073:7 + cmp r1, r11 + lsl.w r0, r1, #1 + beq .LBB3_14 +@ %bb.13: + .loc 1 1074 58 @ ../crypto/hrss/hrss.c:1074:58 + add.w r1, r5, r11, lsl #6 + vld1.64 {d16, d17}, [r1:128] + .loc 1 1074 36 is_stmt 0 @ ../crypto/hrss/hrss.c:1074:36 + add.w r1, r9, r4, lsl #4 + vld1.64 {d18, d19}, [r1:128] +.Ltmp648: + .loc 1 157 58 is_stmt 1 @ ../crypto/hrss/hrss.c:157:58 + vsub.i16 q8, q9, q8 +.Ltmp649: + .loc 1 1074 26 @ ../crypto/hrss/hrss.c:1074:26 + vst1.64 {d16, d17}, [r1:128] + movs r1, #16 + .loc 1 1076 43 @ ../crypto/hrss/hrss.c:1076:43 + orr.w r1, r1, r11, lsl #6 + add r1, r5 + vld1.64 {d16, d17}, [r1:128] + .loc 1 1076 17 is_stmt 0 @ ../crypto/hrss/hrss.c:1076:17 + ldr r1, [r7, #8] + orr r1, r1, #1 + add.w r1, r9, r1, lsl #4 + vld1.64 {d18, d19}, [r1:128] +.Ltmp650: + .loc 1 157 58 is_stmt 1 @ ../crypto/hrss/hrss.c:157:58 + vsub.i16 q8, q9, q8 +.Ltmp651: + .loc 1 1075 30 @ ../crypto/hrss/hrss.c:1075:30 + vst1.64 {d16, d17}, [r1:128] +.LBB3_14: + .loc 1 1080 3 @ ../crypto/hrss/hrss.c:1080:3 + cbz r0, .LBB3_17 +@ %bb.15: + .loc 1 1081 44 @ ../crypto/hrss/hrss.c:1081:44 + add.w r1, r5, r11, lsl #4 +.LBB3_16: @ =>This Inner Loop Header: Depth=1 + .loc 1 1081 32 is_stmt 0 @ ../crypto/hrss/hrss.c:1081:32 + vld1.64 {d16, d17}, [r1:128] + .loc 1 1080 24 is_stmt 1 @ ../crypto/hrss/hrss.c:1080:24 + subs r0, #1 + .loc 1 1081 50 @ ../crypto/hrss/hrss.c:1081:50 + vld1.16 {d18, d19}, [r9:128]! +.Ltmp652: + .loc 1 155 58 @ ../crypto/hrss/hrss.c:155:58 + vadd.i16 q8, q9, q8 +.Ltmp653: + .loc 1 1081 22 @ ../crypto/hrss/hrss.c:1081:22 + vst1.16 {d16, d17}, [r1:128]! + .loc 1 1080 3 @ ../crypto/hrss/hrss.c:1080:3 + bne .LBB3_16 +.LBB3_17: + .loc 1 1083 1 @ ../crypto/hrss/hrss.c:1083:1 + sub.w r4, r7, #96 + mov sp, r4 + vpop {d8, d9, d10, d11, d12, d13, d14, d15} + add sp, #4 + pop.w {r8, r9, r10, r11} + pop {r4, r5, r6, r7, pc} +.Ltmp654: +.Lfunc_end3: + .size poly_mul_vec_aux, .Lfunc_end3-poly_mul_vec_aux + .cfi_endproc + .fnend + + .section ".note.GNU-stack","",%progbits + .section .debug_line,"",%progbits + +#endif diff --git a/crypto/hrss/asm/poly_rq_mul.S b/crypto/hrss/asm/poly_rq_mul.S new file mode 100644 index 00000000..0ad0fb51 --- /dev/null +++ b/crypto/hrss/asm/poly_rq_mul.S @@ -0,0 +1,8457 @@ +// Copyright (c) 2017, the HRSS authors. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && defined(__linux__) + +// This is the polynomial multiplication function from [HRSS], provided by kind +// permission of the authors. +// +// HRSS: https://eprint.iacr.org/2017/1005 + +# This file was generated by poly_rq_mul.py +.text +.align 32 +mask_low9words: +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0x0 +.word 0x0 +.word 0x0 +.word 0x0 +.word 0x0 +.word 0x0 +.word 0x0 +const3: +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +const9: +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +const0: +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +const729: +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +const3_inv: +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +const5_inv: +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +shuf48_16: +.byte 10 +.byte 11 +.byte 12 +.byte 13 +.byte 14 +.byte 15 +.byte 0 +.byte 1 +.byte 2 +.byte 3 +.byte 4 +.byte 5 +.byte 6 +.byte 7 +.byte 8 +.byte 9 +.byte 10 +.byte 11 +.byte 12 +.byte 13 +.byte 14 +.byte 15 +.byte 0 +.byte 1 +.byte 2 +.byte 3 +.byte 4 +.byte 5 +.byte 6 +.byte 7 +.byte 8 +.byte 9 +shufmin1_mask3: +.byte 2 +.byte 3 +.byte 4 +.byte 5 +.byte 6 +.byte 7 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +mask32_to_16: +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +mask5_3_5_3: +.word 0 +.word 0 +.word 0 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +.word 0 +.word 0 +.word 0 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +mask3_5_3_5: +.word 65535 +.word 65535 +.word 65535 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 65535 +.word 65535 +.word 65535 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +mask3_5_4_3_1: +.word 65535 +.word 65535 +.word 65535 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 65535 +.word 65535 +.word 65535 +.word 0 +mask_keephigh: +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +.word 65535 
+.word 65535 +.word 65535 +.word 65535 +mask_mod8192: +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.text +.global poly_Rq_mul +.hidden poly_Rq_mul +.att_syntax prefix +poly_Rq_mul: +.cfi_startproc +push %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp, -16 +movq %rsp, %rbp +.cfi_def_cfa_register rbp +push %r12 +.cfi_offset r12, -24 +mov %rsp, %r8 +andq $-32, %rsp +subq $6144, %rsp +mov %rsp, %rax +subq $6144, %rsp +mov %rsp, %r11 +subq $12288, %rsp +mov %rsp, %r12 +subq $512, %rsp +vmovdqa const3(%rip), %ymm3 +vmovdqu 0(%rsi), %ymm0 +vmovdqu 88(%rsi), %ymm1 +vmovdqu 176(%rsi), %ymm2 +vmovdqu 264(%rsi), %ymm12 +vmovdqu 1056(%rsi), %ymm4 +vmovdqu 1144(%rsi), %ymm5 +vmovdqu 1232(%rsi), %ymm6 +vmovdqu 1320(%rsi), %ymm7 +vmovdqu 352(%rsi), %ymm8 +vmovdqu 440(%rsi), %ymm9 +vmovdqu 528(%rsi), %ymm10 +vmovdqu 616(%rsi), %ymm11 +vmovdqa %ymm0, 0(%rax) +vmovdqa %ymm1, 96(%rax) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 192(%rax) +vmovdqa %ymm2, 288(%rax) +vmovdqa %ymm12, 384(%rax) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 480(%rax) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 576(%rax) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 672(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 768(%rax) +vmovdqa %ymm4, 5184(%rax) +vmovdqa %ymm5, 5280(%rax) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5376(%rax) +vmovdqa %ymm6, 5472(%rax) +vmovdqa %ymm7, 5568(%rax) +vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5664(%rax) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5760(%rax) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5856(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 5952(%rax) +vmovdqa %ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 704(%rsi), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 792(%rsi), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 +vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 880(%rsi), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 968(%rsi), %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, %ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 864(%rax) +vmovdqa %ymm9, 960(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1056(%rax) +vmovdqa %ymm10, 1152(%rax) +vmovdqa %ymm11, 1248(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1344(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1440(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 1536(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 1632(%rax) +vmovdqa %ymm12, 1728(%rax) +vmovdqa %ymm13, 1824(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1920(%rax) +vmovdqa %ymm14, 2016(%rax) +vmovdqa %ymm15, 2112(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2208(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2304(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2400(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2496(%rax) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 
128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 +vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2592(%rax) +vmovdqa %ymm9, 2688(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2784(%rax) +vmovdqa %ymm10, 2880(%rax) +vmovdqa %ymm11, 2976(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3072(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3168(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 3264(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3360(%rax) +vmovdqa %ymm12, 3456(%rax) +vmovdqa %ymm13, 3552(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3648(%rax) +vmovdqa %ymm14, 3744(%rax) +vmovdqa %ymm15, 3840(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 3936(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4032(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4128(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 4224(%rax) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4320(%rax) +vmovdqa %ymm13, 4416(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4512(%rax) +vmovdqa %ymm14, 4608(%rax) +vmovdqa %ymm15, 4704(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4800(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4896(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4992(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 5088(%rax) +vmovdqu 32(%rsi), %ymm0 +vmovdqu 120(%rsi), %ymm1 +vmovdqu 208(%rsi), %ymm2 +vmovdqu 296(%rsi), %ymm12 +vmovdqu 1088(%rsi), %ymm4 +vmovdqu 1176(%rsi), %ymm5 +vmovdqu 1264(%rsi), %ymm6 +vmovdqu 1352(%rsi), %ymm7 +vmovdqu 384(%rsi), %ymm8 +vmovdqu 472(%rsi), %ymm9 +vmovdqu 560(%rsi), %ymm10 +vmovdqu 648(%rsi), %ymm11 +vmovdqa %ymm0, 32(%rax) +vmovdqa %ymm1, 128(%rax) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 224(%rax) +vmovdqa %ymm2, 320(%rax) +vmovdqa %ymm12, 416(%rax) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 512(%rax) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 608(%rax) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 704(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 800(%rax) +vmovdqa %ymm4, 5216(%rax) +vmovdqa %ymm5, 5312(%rax) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5408(%rax) +vmovdqa %ymm6, 5504(%rax) +vmovdqa %ymm7, 5600(%rax) 
+vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5696(%rax) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5792(%rax) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5888(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 5984(%rax) +vmovdqa %ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 736(%rsi), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 824(%rsi), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 +vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 912(%rsi), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 1000(%rsi), %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, %ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 896(%rax) +vmovdqa %ymm9, 992(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1088(%rax) +vmovdqa %ymm10, 1184(%rax) +vmovdqa %ymm11, 1280(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1376(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1472(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 1568(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 1664(%rax) +vmovdqa %ymm12, 1760(%rax) +vmovdqa %ymm13, 1856(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1952(%rax) +vmovdqa %ymm14, 2048(%rax) +vmovdqa %ymm15, 2144(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2240(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2336(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2432(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2528(%rax) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 +vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2624(%rax) +vmovdqa %ymm9, 2720(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2816(%rax) +vmovdqa %ymm10, 2912(%rax) +vmovdqa %ymm11, 3008(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3104(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3200(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 3296(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3392(%rax) +vmovdqa %ymm12, 3488(%rax) +vmovdqa %ymm13, 3584(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3680(%rax) +vmovdqa %ymm14, 3776(%rax) +vmovdqa %ymm15, 3872(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 3968(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4064(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4160(%rax) +vpaddw %ymm0, %ymm1, %ymm0 
+vmovdqa %ymm0, 4256(%rax) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4352(%rax) +vmovdqa %ymm13, 4448(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4544(%rax) +vmovdqa %ymm14, 4640(%rax) +vmovdqa %ymm15, 4736(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4832(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4928(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 5024(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 5120(%rax) +vmovdqu 64(%rsi), %ymm0 +vmovdqu 152(%rsi), %ymm1 +vmovdqu 240(%rsi), %ymm2 +vmovdqu 328(%rsi), %ymm12 +vmovdqu 1120(%rsi), %ymm4 +vmovdqu 1208(%rsi), %ymm5 +vmovdqu 1296(%rsi), %ymm6 +vmovdqu 1384(%rsi), %ymm7 +vpand mask_low9words(%rip), %ymm7, %ymm7 +vmovdqu 416(%rsi), %ymm8 +vmovdqu 504(%rsi), %ymm9 +vmovdqu 592(%rsi), %ymm10 +vmovdqu 680(%rsi), %ymm11 +vmovdqa %ymm0, 64(%rax) +vmovdqa %ymm1, 160(%rax) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 256(%rax) +vmovdqa %ymm2, 352(%rax) +vmovdqa %ymm12, 448(%rax) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 544(%rax) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 640(%rax) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 736(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 832(%rax) +vmovdqa %ymm4, 5248(%rax) +vmovdqa %ymm5, 5344(%rax) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5440(%rax) +vmovdqa %ymm6, 5536(%rax) +vmovdqa %ymm7, 5632(%rax) +vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5728(%rax) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5824(%rax) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5920(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 6016(%rax) +vmovdqa %ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 768(%rsi), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 856(%rsi), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 +vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 944(%rsi), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 1032(%rsi), %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, %ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 928(%rax) +vmovdqa %ymm9, 1024(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1120(%rax) +vmovdqa %ymm10, 1216(%rax) +vmovdqa %ymm11, 1312(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1408(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1504(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 1600(%rax) +vpaddw %ymm0, %ymm1, %ymm0 
+vmovdqa %ymm0, 1696(%rax) +vmovdqa %ymm12, 1792(%rax) +vmovdqa %ymm13, 1888(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1984(%rax) +vmovdqa %ymm14, 2080(%rax) +vmovdqa %ymm15, 2176(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2272(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2368(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2464(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2560(%rax) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 +vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2656(%rax) +vmovdqa %ymm9, 2752(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2848(%rax) +vmovdqa %ymm10, 2944(%rax) +vmovdqa %ymm11, 3040(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3136(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3232(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 3328(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3424(%rax) +vmovdqa %ymm12, 3520(%rax) +vmovdqa %ymm13, 3616(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3712(%rax) +vmovdqa %ymm14, 3808(%rax) +vmovdqa %ymm15, 3904(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4000(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4096(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4192(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 4288(%rax) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4384(%rax) +vmovdqa %ymm13, 4480(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4576(%rax) +vmovdqa %ymm14, 4672(%rax) +vmovdqa %ymm15, 4768(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4864(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4960(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 5056(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 5152(%rax) +vmovdqu 0(%rdx), %ymm0 +vmovdqu 88(%rdx), %ymm1 +vmovdqu 176(%rdx), %ymm2 +vmovdqu 264(%rdx), %ymm12 +vmovdqu 1056(%rdx), %ymm4 +vmovdqu 1144(%rdx), %ymm5 +vmovdqu 1232(%rdx), %ymm6 +vmovdqu 1320(%rdx), %ymm7 +vmovdqu 352(%rdx), %ymm8 +vmovdqu 440(%rdx), %ymm9 +vmovdqu 528(%rdx), %ymm10 +vmovdqu 616(%rdx), %ymm11 +vmovdqa %ymm0, 
0(%r11) +vmovdqa %ymm1, 96(%r11) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 192(%r11) +vmovdqa %ymm2, 288(%r11) +vmovdqa %ymm12, 384(%r11) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 480(%r11) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 576(%r11) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 672(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 768(%r11) +vmovdqa %ymm4, 5184(%r11) +vmovdqa %ymm5, 5280(%r11) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5376(%r11) +vmovdqa %ymm6, 5472(%r11) +vmovdqa %ymm7, 5568(%r11) +vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5664(%r11) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5760(%r11) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5856(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 5952(%r11) +vmovdqa %ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 704(%rdx), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 792(%rdx), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 +vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 880(%rdx), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 968(%rdx), %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, %ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 864(%r11) +vmovdqa %ymm9, 960(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1056(%r11) +vmovdqa %ymm10, 1152(%r11) +vmovdqa %ymm11, 1248(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1344(%r11) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1440(%r11) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 1536(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 1632(%r11) +vmovdqa %ymm12, 1728(%r11) +vmovdqa %ymm13, 1824(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1920(%r11) +vmovdqa %ymm14, 2016(%r11) +vmovdqa %ymm15, 2112(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2208(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2304(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2400(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2496(%r11) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 +vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2592(%r11) +vmovdqa %ymm9, 2688(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2784(%r11) +vmovdqa %ymm10, 2880(%r11) +vmovdqa %ymm11, 2976(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3072(%r11) 
+vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3168(%r11) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 3264(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3360(%r11) +vmovdqa %ymm12, 3456(%r11) +vmovdqa %ymm13, 3552(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3648(%r11) +vmovdqa %ymm14, 3744(%r11) +vmovdqa %ymm15, 3840(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 3936(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4032(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4128(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 4224(%r11) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4320(%r11) +vmovdqa %ymm13, 4416(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4512(%r11) +vmovdqa %ymm14, 4608(%r11) +vmovdqa %ymm15, 4704(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4800(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4896(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4992(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 5088(%r11) +vmovdqu 32(%rdx), %ymm0 +vmovdqu 120(%rdx), %ymm1 +vmovdqu 208(%rdx), %ymm2 +vmovdqu 296(%rdx), %ymm12 +vmovdqu 1088(%rdx), %ymm4 +vmovdqu 1176(%rdx), %ymm5 +vmovdqu 1264(%rdx), %ymm6 +vmovdqu 1352(%rdx), %ymm7 +vmovdqu 384(%rdx), %ymm8 +vmovdqu 472(%rdx), %ymm9 +vmovdqu 560(%rdx), %ymm10 +vmovdqu 648(%rdx), %ymm11 +vmovdqa %ymm0, 32(%r11) +vmovdqa %ymm1, 128(%r11) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 224(%r11) +vmovdqa %ymm2, 320(%r11) +vmovdqa %ymm12, 416(%r11) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 512(%r11) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 608(%r11) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 704(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 800(%r11) +vmovdqa %ymm4, 5216(%r11) +vmovdqa %ymm5, 5312(%r11) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5408(%r11) +vmovdqa %ymm6, 5504(%r11) +vmovdqa %ymm7, 5600(%r11) +vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5696(%r11) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5792(%r11) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5888(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 5984(%r11) +vmovdqa %ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 736(%rdx), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 824(%rdx), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 +vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 912(%rdx), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 1000(%rdx), %ymm0 +vpaddw 
96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, %ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 896(%r11) +vmovdqa %ymm9, 992(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1088(%r11) +vmovdqa %ymm10, 1184(%r11) +vmovdqa %ymm11, 1280(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1376(%r11) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1472(%r11) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 1568(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 1664(%r11) +vmovdqa %ymm12, 1760(%r11) +vmovdqa %ymm13, 1856(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1952(%r11) +vmovdqa %ymm14, 2048(%r11) +vmovdqa %ymm15, 2144(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2240(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2336(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2432(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2528(%r11) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 +vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2624(%r11) +vmovdqa %ymm9, 2720(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2816(%r11) +vmovdqa %ymm10, 2912(%r11) +vmovdqa %ymm11, 3008(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3104(%r11) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3200(%r11) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 3296(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3392(%r11) +vmovdqa %ymm12, 3488(%r11) +vmovdqa %ymm13, 3584(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3680(%r11) +vmovdqa %ymm14, 3776(%r11) +vmovdqa %ymm15, 3872(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 3968(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4064(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4160(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 4256(%r11) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4352(%r11) +vmovdqa %ymm13, 4448(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4544(%r11) +vmovdqa %ymm14, 4640(%r11) +vmovdqa %ymm15, 4736(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4832(%r11) +vpaddw 
%ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4928(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 5024(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 5120(%r11) +vmovdqu 64(%rdx), %ymm0 +vmovdqu 152(%rdx), %ymm1 +vmovdqu 240(%rdx), %ymm2 +vmovdqu 328(%rdx), %ymm12 +vmovdqu 1120(%rdx), %ymm4 +vmovdqu 1208(%rdx), %ymm5 +vmovdqu 1296(%rdx), %ymm6 +vmovdqu 1384(%rdx), %ymm7 +vpand mask_low9words(%rip), %ymm7, %ymm7 +vmovdqu 416(%rdx), %ymm8 +vmovdqu 504(%rdx), %ymm9 +vmovdqu 592(%rdx), %ymm10 +vmovdqu 680(%rdx), %ymm11 +vmovdqa %ymm0, 64(%r11) +vmovdqa %ymm1, 160(%r11) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 256(%r11) +vmovdqa %ymm2, 352(%r11) +vmovdqa %ymm12, 448(%r11) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 544(%r11) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 640(%r11) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 736(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 832(%r11) +vmovdqa %ymm4, 5248(%r11) +vmovdqa %ymm5, 5344(%r11) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5440(%r11) +vmovdqa %ymm6, 5536(%r11) +vmovdqa %ymm7, 5632(%r11) +vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5728(%r11) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5824(%r11) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5920(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 6016(%r11) +vmovdqa %ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 768(%rdx), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 856(%rdx), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 +vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 944(%rdx), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 1032(%rdx), %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, %ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 928(%r11) +vmovdqa %ymm9, 1024(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1120(%r11) +vmovdqa %ymm10, 1216(%r11) +vmovdqa %ymm11, 1312(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1408(%r11) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1504(%r11) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 1600(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 1696(%r11) +vmovdqa %ymm12, 1792(%r11) +vmovdqa %ymm13, 1888(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1984(%r11) +vmovdqa %ymm14, 2080(%r11) +vmovdqa %ymm15, 2176(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2272(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2368(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2464(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2560(%r11) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 +vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), 
%ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2656(%r11) +vmovdqa %ymm9, 2752(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2848(%r11) +vmovdqa %ymm10, 2944(%r11) +vmovdqa %ymm11, 3040(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3136(%r11) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3232(%r11) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 3328(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3424(%r11) +vmovdqa %ymm12, 3520(%r11) +vmovdqa %ymm13, 3616(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3712(%r11) +vmovdqa %ymm14, 3808(%r11) +vmovdqa %ymm15, 3904(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4000(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4096(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4192(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 4288(%r11) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4384(%r11) +vmovdqa %ymm13, 4480(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4576(%r11) +vmovdqa %ymm14, 4672(%r11) +vmovdqa %ymm15, 4768(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4864(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4960(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 5056(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 5152(%r11) +subq $9408, %rsp +mov $4, %ecx +karatsuba_loop_4eced63f144beffcb0247f9c6f67d165: +mov %rsp, %r9 +mov %rsp, %r10 +subq $32, %rsp +vmovdqa 0(%rax), %ymm0 +vmovdqa 192(%rax), %ymm1 +vmovdqa 384(%rax), %ymm2 +vmovdqa 576(%rax), %ymm3 +vpunpcklwd 96(%rax), %ymm0, %ymm4 +vpunpckhwd 96(%rax), %ymm0, %ymm5 +vpunpcklwd 288(%rax), %ymm1, %ymm6 +vpunpckhwd 288(%rax), %ymm1, %ymm7 +vpunpcklwd 480(%rax), %ymm2, %ymm8 +vpunpckhwd 480(%rax), %ymm2, %ymm9 +vpunpcklwd 672(%rax), %ymm3, %ymm10 +vpunpckhwd 672(%rax), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 768(%rax), %ymm0 +vmovdqa 960(%rax), %ymm1 +vmovdqa 1152(%rax), %ymm2 +vmovdqa 1344(%rax), %ymm3 +vpunpcklwd 864(%rax), %ymm0, %ymm12 +vpunpckhwd 864(%rax), %ymm0, %ymm13 +vpunpcklwd 1056(%rax), %ymm1, %ymm14 +vpunpckhwd 1056(%rax), %ymm1, %ymm15 +vpunpcklwd 
1248(%rax), %ymm2, %ymm0 +vpunpckhwd 1248(%rax), %ymm2, %ymm1 +vpunpcklwd 1440(%rax), %ymm3, %ymm2 +vpunpckhwd 1440(%rax), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 0(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 32(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 64(%r9) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 96(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 128(%r9) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 160(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 192(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 256(%r9) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 288(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 320(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 352(%r9) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 384(%r9) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 416(%r9) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 448(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 224(%r9) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 480(%r9) +vmovdqa 32(%rax), %ymm0 +vmovdqa 224(%rax), %ymm1 +vmovdqa 416(%rax), %ymm2 +vmovdqa 608(%rax), %ymm3 +vpunpcklwd 128(%rax), %ymm0, %ymm4 +vpunpckhwd 128(%rax), %ymm0, %ymm5 +vpunpcklwd 320(%rax), %ymm1, %ymm6 +vpunpckhwd 320(%rax), %ymm1, %ymm7 +vpunpcklwd 512(%rax), %ymm2, %ymm8 +vpunpckhwd 512(%rax), %ymm2, %ymm9 +vpunpcklwd 704(%rax), %ymm3, %ymm10 +vpunpckhwd 704(%rax), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 800(%rax), %ymm0 +vmovdqa 992(%rax), %ymm1 +vmovdqa 1184(%rax), %ymm2 +vmovdqa 1376(%rax), %ymm3 +vpunpcklwd 896(%rax), %ymm0, %ymm12 +vpunpckhwd 896(%rax), %ymm0, %ymm13 +vpunpcklwd 1088(%rax), %ymm1, %ymm14 +vpunpckhwd 1088(%rax), %ymm1, %ymm15 +vpunpcklwd 1280(%rax), %ymm2, %ymm0 +vpunpckhwd 1280(%rax), %ymm2, %ymm1 +vpunpcklwd 1472(%rax), %ymm3, %ymm2 +vpunpckhwd 1472(%rax), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq 
%ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 512(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 544(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 576(%r9) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 608(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 640(%r9) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 672(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 704(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 768(%r9) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 800(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 832(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 864(%r9) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 896(%r9) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 928(%r9) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 960(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 736(%r9) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 992(%r9) +vmovdqa 64(%rax), %ymm0 +vmovdqa 256(%rax), %ymm1 +vmovdqa 448(%rax), %ymm2 +vmovdqa 640(%rax), %ymm3 +vpunpcklwd 160(%rax), %ymm0, %ymm4 +vpunpckhwd 160(%rax), %ymm0, %ymm5 +vpunpcklwd 352(%rax), %ymm1, %ymm6 +vpunpckhwd 352(%rax), %ymm1, %ymm7 +vpunpcklwd 544(%rax), %ymm2, %ymm8 +vpunpckhwd 544(%rax), %ymm2, %ymm9 +vpunpcklwd 736(%rax), %ymm3, %ymm10 +vpunpckhwd 736(%rax), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 832(%rax), %ymm0 +vmovdqa 1024(%rax), %ymm1 +vmovdqa 1216(%rax), %ymm2 +vmovdqa 1408(%rax), %ymm3 +vpunpcklwd 928(%rax), %ymm0, %ymm12 +vpunpckhwd 928(%rax), %ymm0, %ymm13 +vpunpcklwd 1120(%rax), %ymm1, %ymm14 +vpunpckhwd 1120(%rax), %ymm1, %ymm15 +vpunpcklwd 1312(%rax), %ymm2, %ymm0 +vpunpckhwd 1312(%rax), %ymm2, %ymm1 +vpunpcklwd 1504(%rax), %ymm3, %ymm2 +vpunpckhwd 1504(%rax), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 
+vmovdqa %ymm15, 1024(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 1056(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 1088(%r9) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 1120(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 1152(%r9) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1184(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1216(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1280(%r9) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1312(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 1344(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 1376(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1248(%r9) +addq $32, %rsp +subq $32, %rsp +vmovdqa 0(%r11), %ymm0 +vmovdqa 192(%r11), %ymm1 +vmovdqa 384(%r11), %ymm2 +vmovdqa 576(%r11), %ymm3 +vpunpcklwd 96(%r11), %ymm0, %ymm4 +vpunpckhwd 96(%r11), %ymm0, %ymm5 +vpunpcklwd 288(%r11), %ymm1, %ymm6 +vpunpckhwd 288(%r11), %ymm1, %ymm7 +vpunpcklwd 480(%r11), %ymm2, %ymm8 +vpunpckhwd 480(%r11), %ymm2, %ymm9 +vpunpcklwd 672(%r11), %ymm3, %ymm10 +vpunpckhwd 672(%r11), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 768(%r11), %ymm0 +vmovdqa 960(%r11), %ymm1 +vmovdqa 1152(%r11), %ymm2 +vmovdqa 1344(%r11), %ymm3 +vpunpcklwd 864(%r11), %ymm0, %ymm12 +vpunpckhwd 864(%r11), %ymm0, %ymm13 +vpunpcklwd 1056(%r11), %ymm1, %ymm14 +vpunpckhwd 1056(%r11), %ymm1, %ymm15 +vpunpcklwd 1248(%r11), %ymm2, %ymm0 +vpunpckhwd 1248(%r11), %ymm2, %ymm1 +vpunpcklwd 1440(%r11), %ymm3, %ymm2 +vpunpckhwd 1440(%r11), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 1408(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 1440(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 1472(%r9) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 1504(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 1536(%r9) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1568(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1600(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 
$0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1664(%r9) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1696(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 1728(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 1760(%r9) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 1792(%r9) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 1824(%r9) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 1856(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1632(%r9) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 1888(%r9) +vmovdqa 32(%r11), %ymm0 +vmovdqa 224(%r11), %ymm1 +vmovdqa 416(%r11), %ymm2 +vmovdqa 608(%r11), %ymm3 +vpunpcklwd 128(%r11), %ymm0, %ymm4 +vpunpckhwd 128(%r11), %ymm0, %ymm5 +vpunpcklwd 320(%r11), %ymm1, %ymm6 +vpunpckhwd 320(%r11), %ymm1, %ymm7 +vpunpcklwd 512(%r11), %ymm2, %ymm8 +vpunpckhwd 512(%r11), %ymm2, %ymm9 +vpunpcklwd 704(%r11), %ymm3, %ymm10 +vpunpckhwd 704(%r11), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 800(%r11), %ymm0 +vmovdqa 992(%r11), %ymm1 +vmovdqa 1184(%r11), %ymm2 +vmovdqa 1376(%r11), %ymm3 +vpunpcklwd 896(%r11), %ymm0, %ymm12 +vpunpckhwd 896(%r11), %ymm0, %ymm13 +vpunpcklwd 1088(%r11), %ymm1, %ymm14 +vpunpckhwd 1088(%r11), %ymm1, %ymm15 +vpunpcklwd 1280(%r11), %ymm2, %ymm0 +vpunpckhwd 1280(%r11), %ymm2, %ymm1 +vpunpcklwd 1472(%r11), %ymm3, %ymm2 +vpunpckhwd 1472(%r11), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 1920(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 1952(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 1984(%r9) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 2016(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 2048(%r9) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 2080(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 2112(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 2176(%r9) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 2208(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 2240(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2272(%r9) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2304(%r9) +vinserti128 $0, %xmm9, %ymm2, %ymm15 
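+# The word/dword/qword unpacks together with vinserti128/vpermq in this region
+# appear to transpose blocks of 16-bit coefficients into the scratch area at
+# (%r9), so that the multiply loop further down can operate on 16 independent
+# coefficient slices per vector instruction.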
+vmovdqa %ymm15, 2336(%r9) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2368(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 2144(%r9) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 2400(%r9) +vmovdqa 64(%r11), %ymm0 +vmovdqa 256(%r11), %ymm1 +vmovdqa 448(%r11), %ymm2 +vmovdqa 640(%r11), %ymm3 +vpunpcklwd 160(%r11), %ymm0, %ymm4 +vpunpckhwd 160(%r11), %ymm0, %ymm5 +vpunpcklwd 352(%r11), %ymm1, %ymm6 +vpunpckhwd 352(%r11), %ymm1, %ymm7 +vpunpcklwd 544(%r11), %ymm2, %ymm8 +vpunpckhwd 544(%r11), %ymm2, %ymm9 +vpunpcklwd 736(%r11), %ymm3, %ymm10 +vpunpckhwd 736(%r11), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 832(%r11), %ymm0 +vmovdqa 1024(%r11), %ymm1 +vmovdqa 1216(%r11), %ymm2 +vmovdqa 1408(%r11), %ymm3 +vpunpcklwd 928(%r11), %ymm0, %ymm12 +vpunpckhwd 928(%r11), %ymm0, %ymm13 +vpunpcklwd 1120(%r11), %ymm1, %ymm14 +vpunpckhwd 1120(%r11), %ymm1, %ymm15 +vpunpcklwd 1312(%r11), %ymm2, %ymm0 +vpunpckhwd 1312(%r11), %ymm2, %ymm1 +vpunpcklwd 1504(%r11), %ymm3, %ymm2 +vpunpckhwd 1504(%r11), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 2432(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 2464(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 2496(%r9) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 2528(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 2560(%r9) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 2592(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 2624(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 2688(%r9) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 2720(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 2752(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2784(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 2656(%r9) +addq $32, %rsp +innerloop_4eced63f144beffcb0247f9c6f67d165: +vmovdqa 0(%r9), %ymm0 +vmovdqa 1408(%r9), %ymm6 +vmovdqa 32(%r9), %ymm1 +vmovdqa 1440(%r9), %ymm7 +vmovdqa 64(%r9), %ymm2 +vmovdqa 1472(%r9), %ymm8 +vmovdqa 96(%r9), %ymm3 +vmovdqa 1504(%r9), %ymm9 +vmovdqa 128(%r9), %ymm4 +vmovdqa 1536(%r9), %ymm10 +vmovdqa 160(%r9), %ymm5 +vmovdqa 1568(%r9), %ymm11 +vpmullw 
%ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 2816(%r10) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 2848(%r10) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 2880(%r10) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 2912(%r10) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 2944(%r10) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 2976(%r10) +vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3008(%r10) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3040(%r10) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3072(%r10) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3104(%r10) +vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 3136(%r10) +vmovdqa 192(%r9), %ymm0 +vmovdqa 1600(%r9), %ymm6 +vmovdqa 224(%r9), %ymm1 +vmovdqa 1632(%r9), %ymm7 +vmovdqa 256(%r9), %ymm2 +vmovdqa 1664(%r9), %ymm8 +vmovdqa 288(%r9), %ymm3 +vmovdqa 1696(%r9), %ymm9 +vmovdqa 320(%r9), %ymm4 +vmovdqa 1728(%r9), %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 3200(%r10) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3232(%r10) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3264(%r10) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3296(%r10) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3328(%r10) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 
3360(%r10) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3392(%r10) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3424(%r10) +vpmullw %ymm4, %ymm10, %ymm12 +vmovdqa %ymm12, 3456(%r10) +vpaddw 0(%r9), %ymm0, %ymm0 +vpaddw 1408(%r9), %ymm6, %ymm6 +vpaddw 32(%r9), %ymm1, %ymm1 +vpaddw 1440(%r9), %ymm7, %ymm7 +vpaddw 64(%r9), %ymm2, %ymm2 +vpaddw 1472(%r9), %ymm8, %ymm8 +vpaddw 96(%r9), %ymm3, %ymm3 +vpaddw 1504(%r9), %ymm9, %ymm9 +vpaddw 128(%r9), %ymm4, %ymm4 +vpaddw 1536(%r9), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 2976(%r10), %ymm12, %ymm12 +vpsubw 3360(%r10), %ymm12, %ymm12 +vmovdqa %ymm12, 3168(%r10) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw %ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, %ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, %ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 3008(%r10), %ymm0 +vpsubw 3200(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 3392(%r10), %ymm6, %ymm6 +vmovdqa %ymm6, 3200(%r10) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 2816(%r10), %ymm0, %ymm0 +vmovdqa %ymm0, 3008(%r10) +vmovdqa 3040(%r10), %ymm1 +vpsubw 3232(%r10), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 3424(%r10), %ymm7, %ymm7 +vmovdqa %ymm7, 3232(%r10) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 2848(%r10), %ymm1, %ymm1 +vmovdqa %ymm1, 3040(%r10) +vmovdqa 3072(%r10), %ymm2 +vpsubw 3264(%r10), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 3456(%r10), %ymm8, %ymm8 +vmovdqa %ymm8, 3264(%r10) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 2880(%r10), %ymm2, %ymm2 +vmovdqa %ymm2, 3072(%r10) +vmovdqa 3104(%r10), %ymm3 +vpsubw 3296(%r10), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 3296(%r10) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 2912(%r10), %ymm3, %ymm3 +vmovdqa %ymm3, 3104(%r10) +vmovdqa 3136(%r10), %ymm4 +vpsubw 3328(%r10), %ymm4, %ymm4 +vpaddw 
%ymm11, %ymm4, %ymm4 +vpsubw 2944(%r10), %ymm4, %ymm4 +vmovdqa %ymm4, 3136(%r10) +vmovdqa 352(%r9), %ymm0 +vmovdqa 1760(%r9), %ymm6 +vmovdqa 384(%r9), %ymm1 +vmovdqa 1792(%r9), %ymm7 +vmovdqa 416(%r9), %ymm2 +vmovdqa 1824(%r9), %ymm8 +vmovdqa 448(%r9), %ymm3 +vmovdqa 1856(%r9), %ymm9 +vmovdqa 480(%r9), %ymm4 +vmovdqa 1888(%r9), %ymm10 +vmovdqa 512(%r9), %ymm5 +vmovdqa 1920(%r9), %ymm11 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 3520(%r10) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3552(%r10) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3584(%r10) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3616(%r10) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3648(%r10) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3680(%r10) +vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3712(%r10) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3744(%r10) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3776(%r10) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3808(%r10) +vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 3840(%r10) +vmovdqa 544(%r9), %ymm0 +vmovdqa 1952(%r9), %ymm6 +vmovdqa 576(%r9), %ymm1 +vmovdqa 1984(%r9), %ymm7 +vmovdqa 608(%r9), %ymm2 +vmovdqa 2016(%r9), %ymm8 +vmovdqa 640(%r9), %ymm3 +vmovdqa 2048(%r9), %ymm9 +vmovdqa 672(%r9), %ymm4 +vmovdqa 2080(%r9), %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 3904(%r10) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3936(%r10) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3968(%r10) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 4000(%r10) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw 
%ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 4032(%r10) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 4064(%r10) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 4096(%r10) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 4128(%r10) +vpmullw %ymm4, %ymm10, %ymm12 +vmovdqa %ymm12, 4160(%r10) +vpaddw 352(%r9), %ymm0, %ymm0 +vpaddw 1760(%r9), %ymm6, %ymm6 +vpaddw 384(%r9), %ymm1, %ymm1 +vpaddw 1792(%r9), %ymm7, %ymm7 +vpaddw 416(%r9), %ymm2, %ymm2 +vpaddw 1824(%r9), %ymm8, %ymm8 +vpaddw 448(%r9), %ymm3, %ymm3 +vpaddw 1856(%r9), %ymm9, %ymm9 +vpaddw 480(%r9), %ymm4, %ymm4 +vpaddw 1888(%r9), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 3680(%r10), %ymm12, %ymm12 +vpsubw 4064(%r10), %ymm12, %ymm12 +vmovdqa %ymm12, 3872(%r10) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw %ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, %ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, %ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 3712(%r10), %ymm0 +vpsubw 3904(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 4096(%r10), %ymm6, %ymm6 +vmovdqa %ymm6, 3904(%r10) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 3520(%r10), %ymm0, %ymm0 +vmovdqa %ymm0, 3712(%r10) +vmovdqa 3744(%r10), %ymm1 +vpsubw 3936(%r10), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 4128(%r10), %ymm7, %ymm7 +vmovdqa %ymm7, 3936(%r10) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 3552(%r10), %ymm1, %ymm1 +vmovdqa %ymm1, 3744(%r10) +vmovdqa 3776(%r10), %ymm2 +vpsubw 3968(%r10), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 
4160(%r10), %ymm8, %ymm8 +vmovdqa %ymm8, 3968(%r10) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 3584(%r10), %ymm2, %ymm2 +vmovdqa %ymm2, 3776(%r10) +vmovdqa 3808(%r10), %ymm3 +vpsubw 4000(%r10), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 4000(%r10) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 3616(%r10), %ymm3, %ymm3 +vmovdqa %ymm3, 3808(%r10) +vmovdqa 3840(%r10), %ymm4 +vpsubw 4032(%r10), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vpsubw 3648(%r10), %ymm4, %ymm4 +vmovdqa %ymm4, 3840(%r10) +vmovdqa 0(%r9), %ymm0 +vmovdqa 1408(%r9), %ymm6 +vpaddw 352(%r9), %ymm0, %ymm0 +vpaddw 1760(%r9), %ymm6, %ymm6 +vmovdqa 32(%r9), %ymm1 +vmovdqa 1440(%r9), %ymm7 +vpaddw 384(%r9), %ymm1, %ymm1 +vpaddw 1792(%r9), %ymm7, %ymm7 +vmovdqa 64(%r9), %ymm2 +vmovdqa 1472(%r9), %ymm8 +vpaddw 416(%r9), %ymm2, %ymm2 +vpaddw 1824(%r9), %ymm8, %ymm8 +vmovdqa 96(%r9), %ymm3 +vmovdqa 1504(%r9), %ymm9 +vpaddw 448(%r9), %ymm3, %ymm3 +vpaddw 1856(%r9), %ymm9, %ymm9 +vmovdqa 128(%r9), %ymm4 +vmovdqa 1536(%r9), %ymm10 +vpaddw 480(%r9), %ymm4, %ymm4 +vpaddw 1888(%r9), %ymm10, %ymm10 +vmovdqa 160(%r9), %ymm5 +vmovdqa 1568(%r9), %ymm11 +vpaddw 512(%r9), %ymm5, %ymm5 +vpaddw 1920(%r9), %ymm11, %ymm11 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 5888(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 5920(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 5952(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 5984(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6016(%rsp) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6048(%rsp) +vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6080(%rsp) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6112(%rsp) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6144(%rsp) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6176(%rsp) +vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 6208(%rsp) +vmovdqa 192(%r9), %ymm0 +vmovdqa 1600(%r9), %ymm6 +vpaddw 544(%r9), %ymm0, %ymm0 +vpaddw 1952(%r9), %ymm6, %ymm6 +vmovdqa 224(%r9), %ymm1 +vmovdqa 1632(%r9), %ymm7 +vpaddw 576(%r9), %ymm1, %ymm1 
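+# The vpaddw loads in this region appear to form the element-wise sums of the
+# two input halves (a Karatsuba step); the product of those sums, computed
+# below, is later corrected by vpsubw of the separately stored low and high
+# sub-products.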
+vpaddw 1984(%r9), %ymm7, %ymm7 +vmovdqa 256(%r9), %ymm2 +vmovdqa 1664(%r9), %ymm8 +vpaddw 608(%r9), %ymm2, %ymm2 +vpaddw 2016(%r9), %ymm8, %ymm8 +vmovdqa 288(%r9), %ymm3 +vmovdqa 1696(%r9), %ymm9 +vpaddw 640(%r9), %ymm3, %ymm3 +vpaddw 2048(%r9), %ymm9, %ymm9 +vmovdqa 320(%r9), %ymm4 +vmovdqa 1728(%r9), %ymm10 +vpaddw 672(%r9), %ymm4, %ymm4 +vpaddw 2080(%r9), %ymm10, %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 6272(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6304(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6336(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6368(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6400(%rsp) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6432(%rsp) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6464(%rsp) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6496(%rsp) +vpmullw %ymm4, %ymm10, %ymm12 +vmovdqa %ymm12, 6528(%rsp) +vpaddw 0(%r9), %ymm0, %ymm0 +vpaddw 1408(%r9), %ymm6, %ymm6 +vpaddw 352(%r9), %ymm0, %ymm0 +vpaddw 1760(%r9), %ymm6, %ymm6 +vpaddw 32(%r9), %ymm1, %ymm1 +vpaddw 1440(%r9), %ymm7, %ymm7 +vpaddw 384(%r9), %ymm1, %ymm1 +vpaddw 1792(%r9), %ymm7, %ymm7 +vpaddw 64(%r9), %ymm2, %ymm2 +vpaddw 1472(%r9), %ymm8, %ymm8 +vpaddw 416(%r9), %ymm2, %ymm2 +vpaddw 1824(%r9), %ymm8, %ymm8 +vpaddw 96(%r9), %ymm3, %ymm3 +vpaddw 1504(%r9), %ymm9, %ymm9 +vpaddw 448(%r9), %ymm3, %ymm3 +vpaddw 1856(%r9), %ymm9, %ymm9 +vpaddw 128(%r9), %ymm4, %ymm4 +vpaddw 1536(%r9), %ymm10, %ymm10 +vpaddw 480(%r9), %ymm4, %ymm4 +vpaddw 1888(%r9), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 6048(%rsp), %ymm12, %ymm12 +vpsubw 6432(%rsp), %ymm12, %ymm12 +vmovdqa %ymm12, 6240(%rsp) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw %ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw 
%ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, %ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, %ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 6080(%rsp), %ymm0 +vpsubw 6272(%rsp), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 6464(%rsp), %ymm6, %ymm6 +vmovdqa %ymm6, 6272(%rsp) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 5888(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 6080(%rsp) +vmovdqa 6112(%rsp), %ymm1 +vpsubw 6304(%rsp), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 6496(%rsp), %ymm7, %ymm7 +vmovdqa %ymm7, 6304(%rsp) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 5920(%rsp), %ymm1, %ymm1 +vmovdqa %ymm1, 6112(%rsp) +vmovdqa 6144(%rsp), %ymm2 +vpsubw 6336(%rsp), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 6528(%rsp), %ymm8, %ymm8 +vmovdqa %ymm8, 6336(%rsp) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 5952(%rsp), %ymm2, %ymm2 +vmovdqa %ymm2, 6144(%rsp) +vmovdqa 6176(%rsp), %ymm3 +vpsubw 6368(%rsp), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 6368(%rsp) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 5984(%rsp), %ymm3, %ymm3 +vmovdqa %ymm3, 6176(%rsp) +vmovdqa 6208(%rsp), %ymm4 +vpsubw 6400(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vpsubw 6016(%rsp), %ymm4, %ymm4 +vmovdqa %ymm4, 6208(%rsp) +vmovdqa 6208(%rsp), %ymm0 +vpsubw 3136(%r10), %ymm0, %ymm0 +vpsubw 3840(%r10), %ymm0, %ymm0 +vmovdqa %ymm0, 3488(%r10) +vmovdqa 3168(%r10), %ymm0 +vpsubw 3520(%r10), %ymm0, %ymm0 +vmovdqa 6240(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3872(%r10), %ymm1, %ymm1 +vpsubw 2816(%r10), %ymm0, %ymm0 +vpaddw 5888(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3168(%r10) +vmovdqa %ymm1, 3520(%r10) +vmovdqa 3200(%r10), %ymm0 +vpsubw 3552(%r10), %ymm0, %ymm0 +vmovdqa 6272(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3904(%r10), %ymm1, %ymm1 +vpsubw 2848(%r10), %ymm0, %ymm0 +vpaddw 5920(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3200(%r10) +vmovdqa %ymm1, 3552(%r10) +vmovdqa 3232(%r10), %ymm0 +vpsubw 3584(%r10), %ymm0, %ymm0 +vmovdqa 6304(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3936(%r10), %ymm1, %ymm1 +vpsubw 2880(%r10), %ymm0, %ymm0 +vpaddw 5952(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3232(%r10) +vmovdqa %ymm1, 3584(%r10) +vmovdqa 3264(%r10), %ymm0 +vpsubw 3616(%r10), %ymm0, %ymm0 +vmovdqa 6336(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3968(%r10), %ymm1, %ymm1 +vpsubw 2912(%r10), %ymm0, %ymm0 +vpaddw 5984(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3264(%r10) +vmovdqa %ymm1, 3616(%r10) +vmovdqa 3296(%r10), %ymm0 +vpsubw 3648(%r10), %ymm0, %ymm0 +vmovdqa 6368(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 4000(%r10), %ymm1, %ymm1 +vpsubw 2944(%r10), %ymm0, %ymm0 +vpaddw 6016(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3296(%r10) +vmovdqa %ymm1, 3648(%r10) +vmovdqa 3328(%r10), %ymm0 +vpsubw 3680(%r10), %ymm0, %ymm0 +vmovdqa 6400(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 4032(%r10), %ymm1, %ymm1 +vpsubw 2976(%r10), %ymm0, %ymm0 
+vpaddw 6048(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3328(%r10) +vmovdqa %ymm1, 3680(%r10) +vmovdqa 3360(%r10), %ymm0 +vpsubw 3712(%r10), %ymm0, %ymm0 +vmovdqa 6432(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 4064(%r10), %ymm1, %ymm1 +vpsubw 3008(%r10), %ymm0, %ymm0 +vpaddw 6080(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3360(%r10) +vmovdqa %ymm1, 3712(%r10) +vmovdqa 3392(%r10), %ymm0 +vpsubw 3744(%r10), %ymm0, %ymm0 +vmovdqa 6464(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 4096(%r10), %ymm1, %ymm1 +vpsubw 3040(%r10), %ymm0, %ymm0 +vpaddw 6112(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3392(%r10) +vmovdqa %ymm1, 3744(%r10) +vmovdqa 3424(%r10), %ymm0 +vpsubw 3776(%r10), %ymm0, %ymm0 +vmovdqa 6496(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 4128(%r10), %ymm1, %ymm1 +vpsubw 3072(%r10), %ymm0, %ymm0 +vpaddw 6144(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3424(%r10) +vmovdqa %ymm1, 3776(%r10) +vmovdqa 3456(%r10), %ymm0 +vpsubw 3808(%r10), %ymm0, %ymm0 +vmovdqa 6528(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 4160(%r10), %ymm1, %ymm1 +vpsubw 3104(%r10), %ymm0, %ymm0 +vpaddw 6176(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3456(%r10) +vmovdqa %ymm1, 3808(%r10) +neg %ecx +jns done_4eced63f144beffcb0247f9c6f67d165 +add $704, %r9 +add $1408, %r10 +jmp innerloop_4eced63f144beffcb0247f9c6f67d165 +done_4eced63f144beffcb0247f9c6f67d165: +sub $704, %r9 +sub $1408, %r10 +vmovdqa 0(%r9), %ymm0 +vpaddw 704(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6592(%rsp) +vmovdqa 1408(%r9), %ymm0 +vpaddw 2112(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7296(%rsp) +vmovdqa 32(%r9), %ymm0 +vpaddw 736(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6624(%rsp) +vmovdqa 1440(%r9), %ymm0 +vpaddw 2144(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7328(%rsp) +vmovdqa 64(%r9), %ymm0 +vpaddw 768(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6656(%rsp) +vmovdqa 1472(%r9), %ymm0 +vpaddw 2176(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7360(%rsp) +vmovdqa 96(%r9), %ymm0 +vpaddw 800(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6688(%rsp) +vmovdqa 1504(%r9), %ymm0 +vpaddw 2208(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7392(%rsp) +vmovdqa 128(%r9), %ymm0 +vpaddw 832(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6720(%rsp) +vmovdqa 1536(%r9), %ymm0 +vpaddw 2240(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7424(%rsp) +vmovdqa 160(%r9), %ymm0 +vpaddw 864(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6752(%rsp) +vmovdqa 1568(%r9), %ymm0 +vpaddw 2272(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7456(%rsp) +vmovdqa 192(%r9), %ymm0 +vpaddw 896(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6784(%rsp) +vmovdqa 1600(%r9), %ymm0 +vpaddw 2304(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7488(%rsp) +vmovdqa 224(%r9), %ymm0 +vpaddw 928(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6816(%rsp) +vmovdqa 1632(%r9), %ymm0 +vpaddw 2336(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7520(%rsp) +vmovdqa 256(%r9), %ymm0 +vpaddw 960(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6848(%rsp) +vmovdqa 1664(%r9), %ymm0 +vpaddw 2368(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7552(%rsp) +vmovdqa 288(%r9), %ymm0 +vpaddw 992(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6880(%rsp) +vmovdqa 1696(%r9), %ymm0 +vpaddw 2400(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7584(%rsp) +vmovdqa 320(%r9), %ymm0 +vpaddw 1024(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6912(%rsp) +vmovdqa 1728(%r9), %ymm0 +vpaddw 2432(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7616(%rsp) +vmovdqa 352(%r9), %ymm0 +vpaddw 1056(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6944(%rsp) +vmovdqa 1760(%r9), %ymm0 +vpaddw 2464(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7648(%rsp) +vmovdqa 384(%r9), %ymm0 +vpaddw 1088(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6976(%rsp) +vmovdqa 1792(%r9), %ymm0 +vpaddw 2496(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7680(%rsp) +vmovdqa 
416(%r9), %ymm0 +vpaddw 1120(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7008(%rsp) +vmovdqa 1824(%r9), %ymm0 +vpaddw 2528(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7712(%rsp) +vmovdqa 448(%r9), %ymm0 +vpaddw 1152(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7040(%rsp) +vmovdqa 1856(%r9), %ymm0 +vpaddw 2560(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7744(%rsp) +vmovdqa 480(%r9), %ymm0 +vpaddw 1184(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7072(%rsp) +vmovdqa 1888(%r9), %ymm0 +vpaddw 2592(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7776(%rsp) +vmovdqa 512(%r9), %ymm0 +vpaddw 1216(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7104(%rsp) +vmovdqa 1920(%r9), %ymm0 +vpaddw 2624(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7808(%rsp) +vmovdqa 544(%r9), %ymm0 +vpaddw 1248(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7136(%rsp) +vmovdqa 1952(%r9), %ymm0 +vpaddw 2656(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7840(%rsp) +vmovdqa 576(%r9), %ymm0 +vpaddw 1280(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7168(%rsp) +vmovdqa 1984(%r9), %ymm0 +vpaddw 2688(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7872(%rsp) +vmovdqa 608(%r9), %ymm0 +vpaddw 1312(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7200(%rsp) +vmovdqa 2016(%r9), %ymm0 +vpaddw 2720(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7904(%rsp) +vmovdqa 640(%r9), %ymm0 +vpaddw 1344(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7232(%rsp) +vmovdqa 2048(%r9), %ymm0 +vpaddw 2752(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7936(%rsp) +vmovdqa 672(%r9), %ymm0 +vpaddw 1376(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7264(%rsp) +vmovdqa 2080(%r9), %ymm0 +vpaddw 2784(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7968(%rsp) +vmovdqa 6592(%rsp), %ymm0 +vmovdqa 7296(%rsp), %ymm6 +vmovdqa 6624(%rsp), %ymm1 +vmovdqa 7328(%rsp), %ymm7 +vmovdqa 6656(%rsp), %ymm2 +vmovdqa 7360(%rsp), %ymm8 +vmovdqa 6688(%rsp), %ymm3 +vmovdqa 7392(%rsp), %ymm9 +vmovdqa 6720(%rsp), %ymm4 +vmovdqa 7424(%rsp), %ymm10 +vmovdqa 6752(%rsp), %ymm5 +vmovdqa 7456(%rsp), %ymm11 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 8000(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8032(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8064(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8096(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8128(%rsp) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8160(%rsp) +vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8192(%rsp) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, 
%ymm13 +vpmullw %ymm5, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8224(%rsp) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8256(%rsp) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8288(%rsp) +vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 8320(%rsp) +vmovdqa 6784(%rsp), %ymm0 +vmovdqa 7488(%rsp), %ymm6 +vmovdqa 6816(%rsp), %ymm1 +vmovdqa 7520(%rsp), %ymm7 +vmovdqa 6848(%rsp), %ymm2 +vmovdqa 7552(%rsp), %ymm8 +vmovdqa 6880(%rsp), %ymm3 +vmovdqa 7584(%rsp), %ymm9 +vmovdqa 6912(%rsp), %ymm4 +vmovdqa 7616(%rsp), %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 8384(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8416(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8448(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8480(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8512(%rsp) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8544(%rsp) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8576(%rsp) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8608(%rsp) +vpmullw %ymm4, %ymm10, %ymm12 +vmovdqa %ymm12, 8640(%rsp) +vpaddw 6592(%rsp), %ymm0, %ymm0 +vpaddw 7296(%rsp), %ymm6, %ymm6 +vpaddw 6624(%rsp), %ymm1, %ymm1 +vpaddw 7328(%rsp), %ymm7, %ymm7 +vpaddw 6656(%rsp), %ymm2, %ymm2 +vpaddw 7360(%rsp), %ymm8, %ymm8 +vpaddw 6688(%rsp), %ymm3, %ymm3 +vpaddw 7392(%rsp), %ymm9, %ymm9 +vpaddw 6720(%rsp), %ymm4, %ymm4 +vpaddw 7424(%rsp), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 8160(%rsp), %ymm12, %ymm12 +vpsubw 8544(%rsp), %ymm12, %ymm12 +vmovdqa %ymm12, 8352(%rsp) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw %ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, 
%ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, %ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, %ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 8192(%rsp), %ymm0 +vpsubw 8384(%rsp), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 8576(%rsp), %ymm6, %ymm6 +vmovdqa %ymm6, 8384(%rsp) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 8000(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8192(%rsp) +vmovdqa 8224(%rsp), %ymm1 +vpsubw 8416(%rsp), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 8608(%rsp), %ymm7, %ymm7 +vmovdqa %ymm7, 8416(%rsp) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 8032(%rsp), %ymm1, %ymm1 +vmovdqa %ymm1, 8224(%rsp) +vmovdqa 8256(%rsp), %ymm2 +vpsubw 8448(%rsp), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 8640(%rsp), %ymm8, %ymm8 +vmovdqa %ymm8, 8448(%rsp) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 8064(%rsp), %ymm2, %ymm2 +vmovdqa %ymm2, 8256(%rsp) +vmovdqa 8288(%rsp), %ymm3 +vpsubw 8480(%rsp), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 8480(%rsp) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 8096(%rsp), %ymm3, %ymm3 +vmovdqa %ymm3, 8288(%rsp) +vmovdqa 8320(%rsp), %ymm4 +vpsubw 8512(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vpsubw 8128(%rsp), %ymm4, %ymm4 +vmovdqa %ymm4, 8320(%rsp) +vmovdqa 6944(%rsp), %ymm0 +vmovdqa 7648(%rsp), %ymm6 +vmovdqa 6976(%rsp), %ymm1 +vmovdqa 7680(%rsp), %ymm7 +vmovdqa 7008(%rsp), %ymm2 +vmovdqa 7712(%rsp), %ymm8 +vmovdqa 7040(%rsp), %ymm3 +vmovdqa 7744(%rsp), %ymm9 +vmovdqa 7072(%rsp), %ymm4 +vmovdqa 7776(%rsp), %ymm10 +vmovdqa 7104(%rsp), %ymm5 +vmovdqa 7808(%rsp), %ymm11 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 8704(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8736(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8768(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8800(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8832(%rsp) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8864(%rsp) 
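+# Each vmovdqa spill in this block appears to complete one output slice of a
+# schoolbook product: vpmullw forms the 16-bit partial products a_i*b_j and
+# vpaddw accumulates every pair with the same i+j before the result is stored.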
+vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8896(%rsp) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8928(%rsp) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8960(%rsp) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8992(%rsp) +vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 9024(%rsp) +vmovdqa 7136(%rsp), %ymm0 +vmovdqa 7840(%rsp), %ymm6 +vmovdqa 7168(%rsp), %ymm1 +vmovdqa 7872(%rsp), %ymm7 +vmovdqa 7200(%rsp), %ymm2 +vmovdqa 7904(%rsp), %ymm8 +vmovdqa 7232(%rsp), %ymm3 +vmovdqa 7936(%rsp), %ymm9 +vmovdqa 7264(%rsp), %ymm4 +vmovdqa 7968(%rsp), %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 9088(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 9120(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 9152(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 9184(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 9216(%rsp) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 9248(%rsp) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 9280(%rsp) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 9312(%rsp) +vpmullw %ymm4, %ymm10, %ymm12 +vmovdqa %ymm12, 9344(%rsp) +vpaddw 6944(%rsp), %ymm0, %ymm0 +vpaddw 7648(%rsp), %ymm6, %ymm6 +vpaddw 6976(%rsp), %ymm1, %ymm1 +vpaddw 7680(%rsp), %ymm7, %ymm7 +vpaddw 7008(%rsp), %ymm2, %ymm2 +vpaddw 7712(%rsp), %ymm8, %ymm8 +vpaddw 7040(%rsp), %ymm3, %ymm3 +vpaddw 7744(%rsp), %ymm9, %ymm9 +vpaddw 7072(%rsp), %ymm4, %ymm4 +vpaddw 7776(%rsp), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 8864(%rsp), %ymm12, %ymm12 +vpsubw 9248(%rsp), %ymm12, %ymm12 +vmovdqa %ymm12, 9056(%rsp) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw 
%ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, %ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, %ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 8896(%rsp), %ymm0 +vpsubw 9088(%rsp), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 9280(%rsp), %ymm6, %ymm6 +vmovdqa %ymm6, 9088(%rsp) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 8704(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8896(%rsp) +vmovdqa 8928(%rsp), %ymm1 +vpsubw 9120(%rsp), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 9312(%rsp), %ymm7, %ymm7 +vmovdqa %ymm7, 9120(%rsp) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 8736(%rsp), %ymm1, %ymm1 +vmovdqa %ymm1, 8928(%rsp) +vmovdqa 8960(%rsp), %ymm2 +vpsubw 9152(%rsp), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 9344(%rsp), %ymm8, %ymm8 +vmovdqa %ymm8, 9152(%rsp) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 8768(%rsp), %ymm2, %ymm2 +vmovdqa %ymm2, 8960(%rsp) +vmovdqa 8992(%rsp), %ymm3 +vpsubw 9184(%rsp), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 9184(%rsp) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 8800(%rsp), %ymm3, %ymm3 +vmovdqa %ymm3, 8992(%rsp) +vmovdqa 9024(%rsp), %ymm4 +vpsubw 9216(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vpsubw 8832(%rsp), %ymm4, %ymm4 +vmovdqa %ymm4, 9024(%rsp) +vmovdqa 6592(%rsp), %ymm0 +vmovdqa 7296(%rsp), %ymm6 +vpaddw 6944(%rsp), %ymm0, %ymm0 +vpaddw 7648(%rsp), %ymm6, %ymm6 +vmovdqa 6624(%rsp), %ymm1 +vmovdqa 7328(%rsp), %ymm7 +vpaddw 6976(%rsp), %ymm1, %ymm1 +vpaddw 7680(%rsp), %ymm7, %ymm7 +vmovdqa 6656(%rsp), %ymm2 +vmovdqa 7360(%rsp), %ymm8 +vpaddw 7008(%rsp), %ymm2, %ymm2 +vpaddw 7712(%rsp), %ymm8, %ymm8 +vmovdqa 6688(%rsp), %ymm3 +vmovdqa 7392(%rsp), %ymm9 +vpaddw 7040(%rsp), %ymm3, %ymm3 +vpaddw 7744(%rsp), %ymm9, %ymm9 +vmovdqa 6720(%rsp), %ymm4 +vmovdqa 7424(%rsp), %ymm10 +vpaddw 7072(%rsp), %ymm4, %ymm4 +vpaddw 7776(%rsp), %ymm10, %ymm10 +vmovdqa 6752(%rsp), %ymm5 +vmovdqa 7456(%rsp), %ymm11 +vpaddw 7104(%rsp), %ymm5, %ymm5 +vpaddw 7808(%rsp), %ymm11, %ymm11 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 5888(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 5920(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 5952(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, 
%ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 5984(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6016(%rsp) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6048(%rsp) +vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6080(%rsp) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6112(%rsp) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6144(%rsp) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6176(%rsp) +vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 6208(%rsp) +vmovdqa 6784(%rsp), %ymm0 +vmovdqa 7488(%rsp), %ymm6 +vpaddw 7136(%rsp), %ymm0, %ymm0 +vpaddw 7840(%rsp), %ymm6, %ymm6 +vmovdqa 6816(%rsp), %ymm1 +vmovdqa 7520(%rsp), %ymm7 +vpaddw 7168(%rsp), %ymm1, %ymm1 +vpaddw 7872(%rsp), %ymm7, %ymm7 +vmovdqa 6848(%rsp), %ymm2 +vmovdqa 7552(%rsp), %ymm8 +vpaddw 7200(%rsp), %ymm2, %ymm2 +vpaddw 7904(%rsp), %ymm8, %ymm8 +vmovdqa 6880(%rsp), %ymm3 +vmovdqa 7584(%rsp), %ymm9 +vpaddw 7232(%rsp), %ymm3, %ymm3 +vpaddw 7936(%rsp), %ymm9, %ymm9 +vmovdqa 6912(%rsp), %ymm4 +vmovdqa 7616(%rsp), %ymm10 +vpaddw 7264(%rsp), %ymm4, %ymm4 +vpaddw 7968(%rsp), %ymm10, %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 6272(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6304(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6336(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6368(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6400(%rsp) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6432(%rsp) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw 
%ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6464(%rsp) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6496(%rsp) +vpmullw %ymm4, %ymm10, %ymm12 +vmovdqa %ymm12, 6528(%rsp) +vpaddw 6592(%rsp), %ymm0, %ymm0 +vpaddw 7296(%rsp), %ymm6, %ymm6 +vpaddw 6944(%rsp), %ymm0, %ymm0 +vpaddw 7648(%rsp), %ymm6, %ymm6 +vpaddw 6624(%rsp), %ymm1, %ymm1 +vpaddw 7328(%rsp), %ymm7, %ymm7 +vpaddw 6976(%rsp), %ymm1, %ymm1 +vpaddw 7680(%rsp), %ymm7, %ymm7 +vpaddw 6656(%rsp), %ymm2, %ymm2 +vpaddw 7360(%rsp), %ymm8, %ymm8 +vpaddw 7008(%rsp), %ymm2, %ymm2 +vpaddw 7712(%rsp), %ymm8, %ymm8 +vpaddw 6688(%rsp), %ymm3, %ymm3 +vpaddw 7392(%rsp), %ymm9, %ymm9 +vpaddw 7040(%rsp), %ymm3, %ymm3 +vpaddw 7744(%rsp), %ymm9, %ymm9 +vpaddw 6720(%rsp), %ymm4, %ymm4 +vpaddw 7424(%rsp), %ymm10, %ymm10 +vpaddw 7072(%rsp), %ymm4, %ymm4 +vpaddw 7776(%rsp), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 6048(%rsp), %ymm12, %ymm12 +vpsubw 6432(%rsp), %ymm12, %ymm12 +vmovdqa %ymm12, 6240(%rsp) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw %ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, %ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, %ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 6080(%rsp), %ymm0 +vpsubw 6272(%rsp), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 6464(%rsp), %ymm6, %ymm6 +vmovdqa %ymm6, 6272(%rsp) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 5888(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 6080(%rsp) +vmovdqa 6112(%rsp), %ymm1 +vpsubw 6304(%rsp), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 6496(%rsp), %ymm7, %ymm7 +vmovdqa %ymm7, 6304(%rsp) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 5920(%rsp), %ymm1, %ymm1 +vmovdqa %ymm1, 6112(%rsp) +vmovdqa 6144(%rsp), %ymm2 +vpsubw 6336(%rsp), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 6528(%rsp), %ymm8, %ymm8 +vmovdqa %ymm8, 6336(%rsp) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 5952(%rsp), %ymm2, %ymm2 
+vmovdqa %ymm2, 6144(%rsp) +vmovdqa 6176(%rsp), %ymm3 +vpsubw 6368(%rsp), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 6368(%rsp) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 5984(%rsp), %ymm3, %ymm3 +vmovdqa %ymm3, 6176(%rsp) +vmovdqa 6208(%rsp), %ymm4 +vpsubw 6400(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vpsubw 6016(%rsp), %ymm4, %ymm4 +vmovdqa %ymm4, 6208(%rsp) +vmovdqa 8352(%rsp), %ymm0 +vpsubw 8704(%rsp), %ymm0, %ymm0 +vmovdqa 6240(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9056(%rsp), %ymm1, %ymm6 +vpsubw 8000(%rsp), %ymm0, %ymm0 +vpaddw 5888(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8352(%rsp) +vmovdqa 8384(%rsp), %ymm0 +vpsubw 8736(%rsp), %ymm0, %ymm0 +vmovdqa 6272(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9088(%rsp), %ymm1, %ymm7 +vpsubw 8032(%rsp), %ymm0, %ymm0 +vpaddw 5920(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8384(%rsp) +vmovdqa 8416(%rsp), %ymm0 +vpsubw 8768(%rsp), %ymm0, %ymm0 +vmovdqa 6304(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9120(%rsp), %ymm1, %ymm8 +vpsubw 8064(%rsp), %ymm0, %ymm0 +vpaddw 5952(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8416(%rsp) +vmovdqa 8448(%rsp), %ymm0 +vpsubw 8800(%rsp), %ymm0, %ymm0 +vmovdqa 6336(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9152(%rsp), %ymm1, %ymm9 +vpsubw 8096(%rsp), %ymm0, %ymm0 +vpaddw 5984(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8448(%rsp) +vmovdqa 8480(%rsp), %ymm0 +vpsubw 8832(%rsp), %ymm0, %ymm0 +vmovdqa 6368(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9184(%rsp), %ymm1, %ymm10 +vpsubw 8128(%rsp), %ymm0, %ymm0 +vpaddw 6016(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8480(%rsp) +vmovdqa 8512(%rsp), %ymm0 +vpsubw 8864(%rsp), %ymm0, %ymm0 +vmovdqa 6400(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9216(%rsp), %ymm1, %ymm11 +vpsubw 8160(%rsp), %ymm0, %ymm0 +vpaddw 6048(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8512(%rsp) +vmovdqa 8544(%rsp), %ymm0 +vpsubw 8896(%rsp), %ymm0, %ymm0 +vmovdqa 6432(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9248(%rsp), %ymm1, %ymm12 +vpsubw 8192(%rsp), %ymm0, %ymm0 +vpaddw 6080(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8544(%rsp) +vmovdqa 8576(%rsp), %ymm0 +vpsubw 8928(%rsp), %ymm0, %ymm0 +vmovdqa 6464(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9280(%rsp), %ymm1, %ymm13 +vpsubw 8224(%rsp), %ymm0, %ymm0 +vpaddw 6112(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8576(%rsp) +vmovdqa 8608(%rsp), %ymm0 +vpsubw 8960(%rsp), %ymm0, %ymm0 +vmovdqa 6496(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9312(%rsp), %ymm1, %ymm14 +vpsubw 8256(%rsp), %ymm0, %ymm0 +vpaddw 6144(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8608(%rsp) +vmovdqa 8640(%rsp), %ymm0 +vpsubw 8992(%rsp), %ymm0, %ymm0 +vmovdqa 6528(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9344(%rsp), %ymm1, %ymm15 +vpsubw 8288(%rsp), %ymm0, %ymm0 +vpaddw 6176(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8640(%rsp) +vmovdqa 6208(%rsp), %ymm0 +vpsubw 8320(%rsp), %ymm0, %ymm0 +vpsubw 9024(%rsp), %ymm0, %ymm0 +vpsubw 3488(%r10), %ymm0, %ymm0 +vpsubw 4896(%r10), %ymm0, %ymm0 +vmovdqa %ymm0, 4192(%r10) +vmovdqa 3520(%r10), %ymm0 +vpsubw 4224(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm6, %ymm6 +vpsubw 4928(%r10), %ymm6, %ymm6 +vpsubw 2816(%r10), %ymm0, %ymm0 +vpaddw 8000(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3520(%r10) +vmovdqa %ymm6, 4224(%r10) +vmovdqa 3552(%r10), %ymm0 +vpsubw 4256(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm7, %ymm7 +vpsubw 4960(%r10), %ymm7, %ymm7 +vpsubw 2848(%r10), %ymm0, %ymm0 +vpaddw 8032(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3552(%r10) +vmovdqa %ymm7, 4256(%r10) +vmovdqa 3584(%r10), %ymm0 +vpsubw 4288(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm8, %ymm8 
+vpsubw 4992(%r10), %ymm8, %ymm8 +vpsubw 2880(%r10), %ymm0, %ymm0 +vpaddw 8064(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3584(%r10) +vmovdqa %ymm8, 4288(%r10) +vmovdqa 3616(%r10), %ymm0 +vpsubw 4320(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm9, %ymm9 +vpsubw 5024(%r10), %ymm9, %ymm9 +vpsubw 2912(%r10), %ymm0, %ymm0 +vpaddw 8096(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3616(%r10) +vmovdqa %ymm9, 4320(%r10) +vmovdqa 3648(%r10), %ymm0 +vpsubw 4352(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm10, %ymm10 +vpsubw 5056(%r10), %ymm10, %ymm10 +vpsubw 2944(%r10), %ymm0, %ymm0 +vpaddw 8128(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3648(%r10) +vmovdqa %ymm10, 4352(%r10) +vmovdqa 3680(%r10), %ymm0 +vpsubw 4384(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm11, %ymm11 +vpsubw 5088(%r10), %ymm11, %ymm11 +vpsubw 2976(%r10), %ymm0, %ymm0 +vpaddw 8160(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3680(%r10) +vmovdqa %ymm11, 4384(%r10) +vmovdqa 3712(%r10), %ymm0 +vpsubw 4416(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm12 +vpsubw 5120(%r10), %ymm12, %ymm12 +vpsubw 3008(%r10), %ymm0, %ymm0 +vpaddw 8192(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3712(%r10) +vmovdqa %ymm12, 4416(%r10) +vmovdqa 3744(%r10), %ymm0 +vpsubw 4448(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm13, %ymm13 +vpsubw 5152(%r10), %ymm13, %ymm13 +vpsubw 3040(%r10), %ymm0, %ymm0 +vpaddw 8224(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3744(%r10) +vmovdqa %ymm13, 4448(%r10) +vmovdqa 3776(%r10), %ymm0 +vpsubw 4480(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm14, %ymm14 +vpsubw 5184(%r10), %ymm14, %ymm14 +vpsubw 3072(%r10), %ymm0, %ymm0 +vpaddw 8256(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3776(%r10) +vmovdqa %ymm14, 4480(%r10) +vmovdqa 3808(%r10), %ymm0 +vpsubw 4512(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm15, %ymm15 +vpsubw 5216(%r10), %ymm15, %ymm15 +vpsubw 3104(%r10), %ymm0, %ymm0 +vpaddw 8288(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3808(%r10) +vmovdqa %ymm15, 4512(%r10) +vmovdqa 3840(%r10), %ymm0 +vpsubw 4544(%r10), %ymm0, %ymm0 +vmovdqa 9024(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5248(%r10), %ymm1, %ymm1 +vpsubw 3136(%r10), %ymm0, %ymm0 +vpaddw 8320(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3840(%r10) +vmovdqa %ymm1, 4544(%r10) +vmovdqa 3872(%r10), %ymm0 +vpsubw 4576(%r10), %ymm0, %ymm0 +vmovdqa 9056(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5280(%r10), %ymm1, %ymm1 +vpsubw 3168(%r10), %ymm0, %ymm0 +vpaddw 8352(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3872(%r10) +vmovdqa %ymm1, 4576(%r10) +vmovdqa 3904(%r10), %ymm0 +vpsubw 4608(%r10), %ymm0, %ymm0 +vmovdqa 9088(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5312(%r10), %ymm1, %ymm1 +vpsubw 3200(%r10), %ymm0, %ymm0 +vpaddw 8384(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3904(%r10) +vmovdqa %ymm1, 4608(%r10) +vmovdqa 3936(%r10), %ymm0 +vpsubw 4640(%r10), %ymm0, %ymm0 +vmovdqa 9120(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5344(%r10), %ymm1, %ymm1 +vpsubw 3232(%r10), %ymm0, %ymm0 +vpaddw 8416(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3936(%r10) +vmovdqa %ymm1, 4640(%r10) +vmovdqa 3968(%r10), %ymm0 +vpsubw 4672(%r10), %ymm0, %ymm0 +vmovdqa 9152(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5376(%r10), %ymm1, %ymm1 +vpsubw 3264(%r10), %ymm0, %ymm0 +vpaddw 8448(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3968(%r10) +vmovdqa %ymm1, 4672(%r10) +vmovdqa 4000(%r10), %ymm0 +vpsubw 4704(%r10), %ymm0, %ymm0 +vmovdqa 9184(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5408(%r10), %ymm1, %ymm1 +vpsubw 3296(%r10), %ymm0, %ymm0 +vpaddw 8480(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4000(%r10) +vmovdqa %ymm1, 4704(%r10) +vmovdqa 4032(%r10), %ymm0 +vpsubw 4736(%r10), %ymm0, %ymm0 +vmovdqa 
9216(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5440(%r10), %ymm1, %ymm1 +vpsubw 3328(%r10), %ymm0, %ymm0 +vpaddw 8512(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4032(%r10) +vmovdqa %ymm1, 4736(%r10) +vmovdqa 4064(%r10), %ymm0 +vpsubw 4768(%r10), %ymm0, %ymm0 +vmovdqa 9248(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5472(%r10), %ymm1, %ymm1 +vpsubw 3360(%r10), %ymm0, %ymm0 +vpaddw 8544(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4064(%r10) +vmovdqa %ymm1, 4768(%r10) +vmovdqa 4096(%r10), %ymm0 +vpsubw 4800(%r10), %ymm0, %ymm0 +vmovdqa 9280(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5504(%r10), %ymm1, %ymm1 +vpsubw 3392(%r10), %ymm0, %ymm0 +vpaddw 8576(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4096(%r10) +vmovdqa %ymm1, 4800(%r10) +vmovdqa 4128(%r10), %ymm0 +vpsubw 4832(%r10), %ymm0, %ymm0 +vmovdqa 9312(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5536(%r10), %ymm1, %ymm1 +vpsubw 3424(%r10), %ymm0, %ymm0 +vpaddw 8608(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4128(%r10) +vmovdqa %ymm1, 4832(%r10) +vmovdqa 4160(%r10), %ymm0 +vpsubw 4864(%r10), %ymm0, %ymm0 +vmovdqa 9344(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5568(%r10), %ymm1, %ymm1 +vpsubw 3456(%r10), %ymm0, %ymm0 +vpaddw 8640(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4160(%r10) +vmovdqa %ymm1, 4864(%r10) +vpxor %ymm1, %ymm1, %ymm1 +vmovdqa %ymm1, 5600(%r10) +subq $32, %rsp +vmovdqa 2816(%r10), %ymm0 +vmovdqa 2880(%r10), %ymm1 +vmovdqa 2944(%r10), %ymm2 +vmovdqa 3008(%r10), %ymm3 +vpunpcklwd 2848(%r10), %ymm0, %ymm4 +vpunpckhwd 2848(%r10), %ymm0, %ymm5 +vpunpcklwd 2912(%r10), %ymm1, %ymm6 +vpunpckhwd 2912(%r10), %ymm1, %ymm7 +vpunpcklwd 2976(%r10), %ymm2, %ymm8 +vpunpckhwd 2976(%r10), %ymm2, %ymm9 +vpunpcklwd 3040(%r10), %ymm3, %ymm10 +vpunpckhwd 3040(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 3072(%r10), %ymm0 +vmovdqa 3136(%r10), %ymm1 +vmovdqa 3200(%r10), %ymm2 +vmovdqa 3264(%r10), %ymm3 +vpunpcklwd 3104(%r10), %ymm0, %ymm12 +vpunpckhwd 3104(%r10), %ymm0, %ymm13 +vpunpcklwd 3168(%r10), %ymm1, %ymm14 +vpunpckhwd 3168(%r10), %ymm1, %ymm15 +vpunpcklwd 3232(%r10), %ymm2, %ymm0 +vpunpckhwd 3232(%r10), %ymm2, %ymm1 +vpunpcklwd 3296(%r10), %ymm3, %ymm2 +vpunpckhwd 3296(%r10), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 0(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 192(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 384(%r12) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 576(%r12) +vinserti128 $1, %xmm12, 
%ymm8, %ymm15 +vmovdqa %ymm15, 768(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 960(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1152(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1536(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1728(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 1920(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2112(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2304(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2496(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2688(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1344(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 2880(%r12) +vmovdqa 3328(%r10), %ymm0 +vmovdqa 3392(%r10), %ymm1 +vmovdqa 3456(%r10), %ymm2 +vmovdqa 3520(%r10), %ymm3 +vpunpcklwd 3360(%r10), %ymm0, %ymm4 +vpunpckhwd 3360(%r10), %ymm0, %ymm5 +vpunpcklwd 3424(%r10), %ymm1, %ymm6 +vpunpckhwd 3424(%r10), %ymm1, %ymm7 +vpunpcklwd 3488(%r10), %ymm2, %ymm8 +vpunpckhwd 3488(%r10), %ymm2, %ymm9 +vpunpcklwd 3552(%r10), %ymm3, %ymm10 +vpunpckhwd 3552(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 3584(%r10), %ymm0 +vmovdqa 3648(%r10), %ymm1 +vmovdqa 3712(%r10), %ymm2 +vmovdqa 3776(%r10), %ymm3 +vpunpcklwd 3616(%r10), %ymm0, %ymm12 +vpunpckhwd 3616(%r10), %ymm0, %ymm13 +vpunpcklwd 3680(%r10), %ymm1, %ymm14 +vpunpckhwd 3680(%r10), %ymm1, %ymm15 +vpunpcklwd 3744(%r10), %ymm2, %ymm0 +vpunpckhwd 3744(%r10), %ymm2, %ymm1 +vpunpcklwd 3808(%r10), %ymm3, %ymm2 +vpunpckhwd 3808(%r10), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 32(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 224(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 416(%r12) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 608(%r12) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 800(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 992(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1184(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 
+vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1568(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1760(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 1952(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2144(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2336(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2528(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2720(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1376(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 2912(%r12) +vmovdqa 3840(%r10), %ymm0 +vmovdqa 3904(%r10), %ymm1 +vmovdqa 3968(%r10), %ymm2 +vmovdqa 4032(%r10), %ymm3 +vpunpcklwd 3872(%r10), %ymm0, %ymm4 +vpunpckhwd 3872(%r10), %ymm0, %ymm5 +vpunpcklwd 3936(%r10), %ymm1, %ymm6 +vpunpckhwd 3936(%r10), %ymm1, %ymm7 +vpunpcklwd 4000(%r10), %ymm2, %ymm8 +vpunpckhwd 4000(%r10), %ymm2, %ymm9 +vpunpcklwd 4064(%r10), %ymm3, %ymm10 +vpunpckhwd 4064(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 4096(%r10), %ymm0 +vmovdqa 4160(%r10), %ymm1 +vmovdqa 4224(%r10), %ymm2 +vmovdqa 4288(%r10), %ymm3 +vpunpcklwd 4128(%r10), %ymm0, %ymm12 +vpunpckhwd 4128(%r10), %ymm0, %ymm13 +vpunpcklwd 4192(%r10), %ymm1, %ymm14 +vpunpckhwd 4192(%r10), %ymm1, %ymm15 +vpunpcklwd 4256(%r10), %ymm2, %ymm0 +vpunpckhwd 4256(%r10), %ymm2, %ymm1 +vpunpcklwd 4320(%r10), %ymm3, %ymm2 +vpunpckhwd 4320(%r10), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 64(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 256(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 448(%r12) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 640(%r12) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 832(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1024(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1216(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1600(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1792(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 1984(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2176(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 
+vmovdqa %ymm15, 2368(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2560(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2752(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1408(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 2944(%r12) +vmovdqa 4224(%r10), %ymm0 +vmovdqa 4288(%r10), %ymm1 +vmovdqa 4352(%r10), %ymm2 +vmovdqa 4416(%r10), %ymm3 +vpunpcklwd 4256(%r10), %ymm0, %ymm4 +vpunpckhwd 4256(%r10), %ymm0, %ymm5 +vpunpcklwd 4320(%r10), %ymm1, %ymm6 +vpunpckhwd 4320(%r10), %ymm1, %ymm7 +vpunpcklwd 4384(%r10), %ymm2, %ymm8 +vpunpckhwd 4384(%r10), %ymm2, %ymm9 +vpunpcklwd 4448(%r10), %ymm3, %ymm10 +vpunpckhwd 4448(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 4480(%r10), %ymm0 +vmovdqa 4544(%r10), %ymm1 +vmovdqa 4608(%r10), %ymm2 +vmovdqa 4672(%r10), %ymm3 +vpunpcklwd 4512(%r10), %ymm0, %ymm12 +vpunpckhwd 4512(%r10), %ymm0, %ymm13 +vpunpcklwd 4576(%r10), %ymm1, %ymm14 +vpunpckhwd 4576(%r10), %ymm1, %ymm15 +vpunpcklwd 4640(%r10), %ymm2, %ymm0 +vpunpckhwd 4640(%r10), %ymm2, %ymm1 +vpunpcklwd 4704(%r10), %ymm3, %ymm2 +vpunpckhwd 4704(%r10), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 96(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 288(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 480(%r12) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 672(%r12) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 864(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1056(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1248(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1632(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1824(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 2016(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2208(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2400(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2592(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2784(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1440(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa 
%ymm1, 2976(%r12) +vmovdqa 4736(%r10), %ymm0 +vmovdqa 4800(%r10), %ymm1 +vmovdqa 4864(%r10), %ymm2 +vmovdqa 4928(%r10), %ymm3 +vpunpcklwd 4768(%r10), %ymm0, %ymm4 +vpunpckhwd 4768(%r10), %ymm0, %ymm5 +vpunpcklwd 4832(%r10), %ymm1, %ymm6 +vpunpckhwd 4832(%r10), %ymm1, %ymm7 +vpunpcklwd 4896(%r10), %ymm2, %ymm8 +vpunpckhwd 4896(%r10), %ymm2, %ymm9 +vpunpcklwd 4960(%r10), %ymm3, %ymm10 +vpunpckhwd 4960(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 4992(%r10), %ymm0 +vmovdqa 5056(%r10), %ymm1 +vmovdqa 5120(%r10), %ymm2 +vmovdqa 5184(%r10), %ymm3 +vpunpcklwd 5024(%r10), %ymm0, %ymm12 +vpunpckhwd 5024(%r10), %ymm0, %ymm13 +vpunpcklwd 5088(%r10), %ymm1, %ymm14 +vpunpckhwd 5088(%r10), %ymm1, %ymm15 +vpunpcklwd 5152(%r10), %ymm2, %ymm0 +vpunpckhwd 5152(%r10), %ymm2, %ymm1 +vpunpcklwd 5216(%r10), %ymm3, %ymm2 +vpunpckhwd 5216(%r10), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 128(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 320(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 512(%r12) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 704(%r12) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 896(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1088(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1280(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1664(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1856(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 2048(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2240(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2432(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2624(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2816(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1472(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 3008(%r12) +vmovdqa 5248(%r10), %ymm0 +vmovdqa 5312(%r10), %ymm1 +vmovdqa 5376(%r10), %ymm2 +vmovdqa 5440(%r10), %ymm3 +vpunpcklwd 5280(%r10), %ymm0, %ymm4 +vpunpckhwd 5280(%r10), %ymm0, %ymm5 +vpunpcklwd 5344(%r10), %ymm1, %ymm6 +vpunpckhwd 5344(%r10), %ymm1, %ymm7 +vpunpcklwd 5408(%r10), %ymm2, %ymm8 +vpunpckhwd 
5408(%r10), %ymm2, %ymm9 +vpunpcklwd 5472(%r10), %ymm3, %ymm10 +vpunpckhwd 5472(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 5504(%r10), %ymm0 +vmovdqa 5568(%r10), %ymm1 +vmovdqa 5632(%r10), %ymm2 +vmovdqa 5696(%r10), %ymm3 +vpunpcklwd 5536(%r10), %ymm0, %ymm12 +vpunpckhwd 5536(%r10), %ymm0, %ymm13 +vpunpcklwd 5600(%r10), %ymm1, %ymm14 +vpunpckhwd 5600(%r10), %ymm1, %ymm15 +vpunpcklwd 5664(%r10), %ymm2, %ymm0 +vpunpckhwd 5664(%r10), %ymm2, %ymm1 +vpunpcklwd 5728(%r10), %ymm3, %ymm2 +vpunpckhwd 5728(%r10), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 160(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 352(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 544(%r12) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 736(%r12) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 928(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1120(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1312(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1696(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1888(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 2080(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2272(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2464(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2656(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2848(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1504(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 3040(%r12) +addq $32, %rsp +add $1536, %rax +add $1536, %r11 +add $3072, %r12 +dec %ecx +jnz karatsuba_loop_4eced63f144beffcb0247f9c6f67d165 +sub $12288, %r12 +add $9408, %rsp +subq $2400, %rsp +vpxor %ymm0, %ymm0, %ymm0 +vmovdqa %ymm0, 1792(%rsp) +vmovdqa %ymm0, 1824(%rsp) +vmovdqa %ymm0, 1856(%rsp) +vmovdqa %ymm0, 1888(%rsp) +vmovdqa %ymm0, 1920(%rsp) +vmovdqa %ymm0, 1952(%rsp) +vmovdqa %ymm0, 1984(%rsp) +vmovdqa %ymm0, 2016(%rsp) +vmovdqa %ymm0, 2048(%rsp) +vmovdqa %ymm0, 2080(%rsp) +vmovdqa %ymm0, 2112(%rsp) +vmovdqa %ymm0, 2144(%rsp) +vmovdqa %ymm0, 2176(%rsp) +vmovdqa %ymm0, 2208(%rsp) +vmovdqa %ymm0, 2240(%rsp) +vmovdqa %ymm0, 
2272(%rsp) +vmovdqa %ymm0, 2304(%rsp) +vmovdqa %ymm0, 2336(%rsp) +vmovdqa %ymm0, 2368(%rsp) +vmovdqa %ymm0, 2400(%rsp) +vmovdqa %ymm0, 2432(%rsp) +vmovdqa %ymm0, 2464(%rsp) +vmovdqa %ymm0, 2496(%rsp) +vmovdqa %ymm0, 2528(%rsp) +vmovdqa %ymm0, 2560(%rsp) +vmovdqa %ymm0, 2592(%rsp) +vmovdqa %ymm0, 2624(%rsp) +vmovdqa %ymm0, 2656(%rsp) +vmovdqa %ymm0, 2688(%rsp) +vmovdqa %ymm0, 2720(%rsp) +vmovdqa %ymm0, 2752(%rsp) +vmovdqa %ymm0, 2784(%rsp) +vmovdqa const729(%rip), %ymm15 +vmovdqa const3_inv(%rip), %ymm14 +vmovdqa const5_inv(%rip), %ymm13 +vmovdqa const9(%rip), %ymm12 +vmovdqa 96(%r12), %ymm0 +vpsubw 192(%r12), %ymm0, %ymm0 +vmovdqa 480(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 288(%r12), %ymm1, %ymm1 +vpsubw 0(%r12), %ymm0, %ymm0 +vpaddw 384(%r12), %ymm0, %ymm0 +vmovdqa 672(%r12), %ymm2 +vpsubw 768(%r12), %ymm2, %ymm2 +vmovdqa 1056(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 864(%r12), %ymm3, %ymm3 +vpsubw 576(%r12), %ymm2, %ymm2 +vpaddw 960(%r12), %ymm2, %ymm2 +vmovdqa 1248(%r12), %ymm4 +vpsubw 1344(%r12), %ymm4, %ymm4 +vmovdqa 1632(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 1440(%r12), %ymm5, %ymm5 +vpsubw 1152(%r12), %ymm4, %ymm4 +vpaddw 1536(%r12), %ymm4, %ymm4 +vpsubw 576(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 0(%r12), %ymm1, %ymm1 +vpaddw 1152(%r12), %ymm1, %ymm1 +vmovdqa 288(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 1440(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 864(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 0(%r12), %ymm8 +vmovdqa 864(%r12), %ymm9 +vmovdqa %ymm8, 0(%rsp) +vmovdqa %ymm0, 32(%rsp) +vmovdqa %ymm1, 64(%rsp) +vmovdqa %ymm7, 96(%rsp) +vmovdqa %ymm5, 128(%rsp) +vmovdqa %ymm2, 160(%rsp) +vmovdqa %ymm3, 192(%rsp) +vmovdqa %ymm9, 224(%rsp) +vmovdqa 1824(%r12), %ymm0 +vpsubw 1920(%r12), %ymm0, %ymm0 +vmovdqa 2208(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 2016(%r12), %ymm1, %ymm1 +vpsubw 1728(%r12), %ymm0, %ymm0 +vpaddw 2112(%r12), %ymm0, %ymm0 +vmovdqa 2400(%r12), %ymm2 +vpsubw 2496(%r12), %ymm2, %ymm2 +vmovdqa 2784(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 2592(%r12), %ymm3, %ymm3 +vpsubw 2304(%r12), %ymm2, %ymm2 +vpaddw 2688(%r12), %ymm2, %ymm2 +vmovdqa 2976(%r12), %ymm4 +vpsubw 3072(%r12), %ymm4, %ymm4 +vmovdqa 3360(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 3168(%r12), %ymm5, %ymm5 +vpsubw 2880(%r12), %ymm4, %ymm4 +vpaddw 3264(%r12), %ymm4, %ymm4 +vpsubw 2304(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 1728(%r12), %ymm1, %ymm1 +vpaddw 2880(%r12), %ymm1, %ymm1 +vmovdqa 2016(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 3168(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 2592(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 1728(%r12), %ymm8 +vmovdqa 2592(%r12), %ymm9 +vmovdqa %ymm8, 256(%rsp) +vmovdqa %ymm0, 288(%rsp) +vmovdqa %ymm1, 320(%rsp) +vmovdqa %ymm7, 352(%rsp) +vmovdqa %ymm5, 384(%rsp) +vmovdqa %ymm2, 416(%rsp) +vmovdqa %ymm3, 448(%rsp) +vmovdqa %ymm9, 480(%rsp) +vmovdqa 3552(%r12), %ymm0 +vpsubw 3648(%r12), %ymm0, %ymm0 +vmovdqa 3936(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3744(%r12), %ymm1, %ymm1 +vpsubw 3456(%r12), %ymm0, %ymm0 +vpaddw 3840(%r12), %ymm0, %ymm0 +vmovdqa 4128(%r12), %ymm2 +vpsubw 4224(%r12), %ymm2, %ymm2 +vmovdqa 4512(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 4320(%r12), %ymm3, %ymm3 +vpsubw 4032(%r12), %ymm2, %ymm2 +vpaddw 4416(%r12), %ymm2, %ymm2 +vmovdqa 4704(%r12), %ymm4 +vpsubw 4800(%r12), %ymm4, %ymm4 
+vmovdqa 5088(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 4896(%r12), %ymm5, %ymm5 +vpsubw 4608(%r12), %ymm4, %ymm4 +vpaddw 4992(%r12), %ymm4, %ymm4 +vpsubw 4032(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 3456(%r12), %ymm1, %ymm1 +vpaddw 4608(%r12), %ymm1, %ymm1 +vmovdqa 3744(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 4896(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 4320(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 3456(%r12), %ymm8 +vmovdqa 4320(%r12), %ymm9 +vmovdqa %ymm8, 512(%rsp) +vmovdqa %ymm0, 544(%rsp) +vmovdqa %ymm1, 576(%rsp) +vmovdqa %ymm7, 608(%rsp) +vmovdqa %ymm5, 640(%rsp) +vmovdqa %ymm2, 672(%rsp) +vmovdqa %ymm3, 704(%rsp) +vmovdqa %ymm9, 736(%rsp) +vmovdqa 5280(%r12), %ymm0 +vpsubw 5376(%r12), %ymm0, %ymm0 +vmovdqa 5664(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5472(%r12), %ymm1, %ymm1 +vpsubw 5184(%r12), %ymm0, %ymm0 +vpaddw 5568(%r12), %ymm0, %ymm0 +vmovdqa 5856(%r12), %ymm2 +vpsubw 5952(%r12), %ymm2, %ymm2 +vmovdqa 6240(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 6048(%r12), %ymm3, %ymm3 +vpsubw 5760(%r12), %ymm2, %ymm2 +vpaddw 6144(%r12), %ymm2, %ymm2 +vmovdqa 6432(%r12), %ymm4 +vpsubw 6528(%r12), %ymm4, %ymm4 +vmovdqa 6816(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 6624(%r12), %ymm5, %ymm5 +vpsubw 6336(%r12), %ymm4, %ymm4 +vpaddw 6720(%r12), %ymm4, %ymm4 +vpsubw 5760(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 5184(%r12), %ymm1, %ymm1 +vpaddw 6336(%r12), %ymm1, %ymm1 +vmovdqa 5472(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 6624(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 6048(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 5184(%r12), %ymm8 +vmovdqa 6048(%r12), %ymm9 +vmovdqa %ymm8, 768(%rsp) +vmovdqa %ymm0, 800(%rsp) +vmovdqa %ymm1, 832(%rsp) +vmovdqa %ymm7, 864(%rsp) +vmovdqa %ymm5, 896(%rsp) +vmovdqa %ymm2, 928(%rsp) +vmovdqa %ymm3, 960(%rsp) +vmovdqa %ymm9, 992(%rsp) +vmovdqa 7008(%r12), %ymm0 +vpsubw 7104(%r12), %ymm0, %ymm0 +vmovdqa 7392(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 7200(%r12), %ymm1, %ymm1 +vpsubw 6912(%r12), %ymm0, %ymm0 +vpaddw 7296(%r12), %ymm0, %ymm0 +vmovdqa 7584(%r12), %ymm2 +vpsubw 7680(%r12), %ymm2, %ymm2 +vmovdqa 7968(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 7776(%r12), %ymm3, %ymm3 +vpsubw 7488(%r12), %ymm2, %ymm2 +vpaddw 7872(%r12), %ymm2, %ymm2 +vmovdqa 8160(%r12), %ymm4 +vpsubw 8256(%r12), %ymm4, %ymm4 +vmovdqa 8544(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 8352(%r12), %ymm5, %ymm5 +vpsubw 8064(%r12), %ymm4, %ymm4 +vpaddw 8448(%r12), %ymm4, %ymm4 +vpsubw 7488(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 6912(%r12), %ymm1, %ymm1 +vpaddw 8064(%r12), %ymm1, %ymm1 +vmovdqa 7200(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 8352(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 7776(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 6912(%r12), %ymm8 +vmovdqa 7776(%r12), %ymm9 +vmovdqa %ymm8, 1024(%rsp) +vmovdqa %ymm0, 1056(%rsp) +vmovdqa %ymm1, 1088(%rsp) +vmovdqa %ymm7, 1120(%rsp) +vmovdqa %ymm5, 1152(%rsp) +vmovdqa %ymm2, 1184(%rsp) +vmovdqa %ymm3, 1216(%rsp) +vmovdqa %ymm9, 1248(%rsp) +vmovdqa 8736(%r12), %ymm0 +vpsubw 8832(%r12), %ymm0, %ymm0 +vmovdqa 9120(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 8928(%r12), %ymm1, %ymm1 +vpsubw 8640(%r12), %ymm0, %ymm0 +vpaddw 9024(%r12), %ymm0, %ymm0 +vmovdqa 9312(%r12), %ymm2 +vpsubw 9408(%r12), 
%ymm2, %ymm2 +vmovdqa 9696(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 9504(%r12), %ymm3, %ymm3 +vpsubw 9216(%r12), %ymm2, %ymm2 +vpaddw 9600(%r12), %ymm2, %ymm2 +vmovdqa 9888(%r12), %ymm4 +vpsubw 9984(%r12), %ymm4, %ymm4 +vmovdqa 10272(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 10080(%r12), %ymm5, %ymm5 +vpsubw 9792(%r12), %ymm4, %ymm4 +vpaddw 10176(%r12), %ymm4, %ymm4 +vpsubw 9216(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 8640(%r12), %ymm1, %ymm1 +vpaddw 9792(%r12), %ymm1, %ymm1 +vmovdqa 8928(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 10080(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 9504(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 8640(%r12), %ymm8 +vmovdqa 9504(%r12), %ymm9 +vmovdqa %ymm8, 1280(%rsp) +vmovdqa %ymm0, 1312(%rsp) +vmovdqa %ymm1, 1344(%rsp) +vmovdqa %ymm7, 1376(%rsp) +vmovdqa %ymm5, 1408(%rsp) +vmovdqa %ymm2, 1440(%rsp) +vmovdqa %ymm3, 1472(%rsp) +vmovdqa %ymm9, 1504(%rsp) +vmovdqa 10464(%r12), %ymm0 +vpsubw 10560(%r12), %ymm0, %ymm0 +vmovdqa 10848(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 10656(%r12), %ymm1, %ymm1 +vpsubw 10368(%r12), %ymm0, %ymm0 +vpaddw 10752(%r12), %ymm0, %ymm0 +vmovdqa 11040(%r12), %ymm2 +vpsubw 11136(%r12), %ymm2, %ymm2 +vmovdqa 11424(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 11232(%r12), %ymm3, %ymm3 +vpsubw 10944(%r12), %ymm2, %ymm2 +vpaddw 11328(%r12), %ymm2, %ymm2 +vmovdqa 11616(%r12), %ymm4 +vpsubw 11712(%r12), %ymm4, %ymm4 +vmovdqa 12000(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 11808(%r12), %ymm5, %ymm5 +vpsubw 11520(%r12), %ymm4, %ymm4 +vpaddw 11904(%r12), %ymm4, %ymm4 +vpsubw 10944(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 10368(%r12), %ymm1, %ymm1 +vpaddw 11520(%r12), %ymm1, %ymm1 +vmovdqa 10656(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 11808(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 11232(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 10368(%r12), %ymm8 +vmovdqa 11232(%r12), %ymm9 +vmovdqa %ymm8, 1536(%rsp) +vmovdqa %ymm0, 1568(%rsp) +vmovdqa %ymm1, 1600(%rsp) +vmovdqa %ymm7, 1632(%rsp) +vmovdqa %ymm5, 1664(%rsp) +vmovdqa %ymm2, 1696(%rsp) +vmovdqa %ymm3, 1728(%rsp) +vmovdqa %ymm9, 1760(%rsp) +vmovdqa 0(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm10 +vpunpckhwd const0(%rip), %ymm11, %ymm9 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm9, %ymm9 +vmovdqa 256(%rsp), %ymm8 +vpunpcklwd const0(%rip), %ymm8, %ymm7 +vpunpckhwd const0(%rip), %ymm8, %ymm8 +vmovdqa 512(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm7, %ymm4 +vpaddd %ymm6, %ymm8, %ymm3 +vpsubd %ymm10, %ymm4, %ymm4 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm5, %ymm7, %ymm5 +vpsubd %ymm6, %ymm8, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1536(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm8 +vpunpckhwd const0(%rip), %ymm5, %ymm7 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm7, %ymm7 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm7, %ymm3, %ymm3 +vpsrld $1, %ymm4, %ymm4 +vpsrld $1, %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpackusdw %ymm3, %ymm4, %ymm3 +vmovdqa 768(%rsp), %ymm4 +vpaddw 1024(%rsp), %ymm4, %ymm7 +vpsubw 1024(%rsp), %ymm4, %ymm4 +vpsrlw $2, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsllw 
$1, %ymm11, %ymm8 +vpsubw %ymm8, %ymm7, %ymm8 +vpsllw $7, %ymm5, %ymm7 +vpsubw %ymm7, %ymm8, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm3, %ymm7, %ymm7 +vmovdqa 1280(%rsp), %ymm8 +vpsubw %ymm11, %ymm8, %ymm8 +vpmullw %ymm15, %ymm5, %ymm9 +vpsubw %ymm9, %ymm8, %ymm9 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm7, %ymm3, %ymm3 +vpmullw %ymm12, %ymm7, %ymm8 +vpaddw %ymm8, %ymm3, %ymm8 +vpmullw %ymm12, %ymm8, %ymm8 +vpsubw %ymm8, %ymm9, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm4, %ymm8, %ymm8 +vpsubw %ymm8, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpmullw %ymm13, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_3_5(%rip), %ymm7, %ymm9 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm10 +vpor %ymm10, %ymm7, %ymm7 +vpaddw %ymm7, %ymm11, %ymm11 +vmovdqa %xmm9, 2048(%rsp) +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm9 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm10 +vpor %ymm10, %ymm8, %ymm8 +vpaddw %ymm8, %ymm6, %ymm6 +vmovdqa %xmm9, 2304(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm9 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm10 +vpor %ymm10, %ymm5, %ymm5 +vpaddw %ymm5, %ymm3, %ymm3 +vmovdqa %xmm9, 2560(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 0(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 352(%rdi) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 704(%rdi) +vpand mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %ymm4, 1056(%rdi) +vmovdqa 32(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm8 +vpunpckhwd const0(%rip), %ymm5, %ymm7 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm7, %ymm7 +vmovdqa 288(%rsp), %ymm4 +vpunpcklwd const0(%rip), %ymm4, %ymm3 +vpunpckhwd const0(%rip), %ymm4, %ymm4 +vmovdqa 544(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm3, %ymm9 +vpaddd %ymm6, %ymm4, %ymm10 +vpsubd %ymm8, %ymm9, %ymm9 +vpsubd %ymm7, %ymm10, %ymm10 +vpsubd %ymm11, %ymm3, %ymm11 +vpsubd %ymm6, %ymm4, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1568(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm3 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm3, %ymm3 +vpsubd %ymm4, %ymm9, %ymm9 +vpsubd %ymm3, %ymm10, %ymm10 +vpsrld $1, %ymm9, %ymm9 +vpsrld $1, %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpackusdw %ymm10, %ymm9, %ymm10 +vmovdqa 800(%rsp), %ymm9 +vpaddw 1056(%rsp), %ymm9, %ymm3 +vpsubw 1056(%rsp), %ymm9, %ymm9 +vpsrlw $2, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsllw $1, %ymm5, %ymm4 +vpsubw %ymm4, %ymm3, %ymm4 +vpsllw $7, %ymm11, %ymm3 +vpsubw %ymm3, %ymm4, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm10, %ymm3, %ymm3 +vmovdqa 1312(%rsp), %ymm4 +vpsubw %ymm5, %ymm4, %ymm4 +vpmullw %ymm15, %ymm11, %ymm7 +vpsubw %ymm7, %ymm4, %ymm7 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm3, %ymm10, %ymm10 +vpmullw %ymm12, %ymm3, %ymm4 +vpaddw %ymm4, %ymm10, %ymm4 +vpmullw %ymm12, %ymm4, %ymm4 +vpsubw %ymm4, %ymm7, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, 
%ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpmullw %ymm13, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_3_5(%rip), %ymm3, %ymm7 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm8 +vpor %ymm8, %ymm3, %ymm3 +vpaddw %ymm3, %ymm5, %ymm5 +vmovdqa %xmm7, 2080(%rsp) +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm7 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm8 +vpor %ymm8, %ymm4, %ymm4 +vpaddw %ymm4, %ymm6, %ymm6 +vmovdqa %xmm7, 2336(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm7 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm8 +vpor %ymm8, %ymm11, %ymm11 +vpaddw %ymm11, %ymm10, %ymm10 +vmovdqa %xmm7, 2592(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 88(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 440(%rdi) +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 792(%rdi) +vpand mask_mod8192(%rip), %ymm9, %ymm9 +vmovdqu %ymm9, 1144(%rdi) +vmovdqa 64(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm3 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm3, %ymm3 +vmovdqa 320(%rsp), %ymm9 +vpunpcklwd const0(%rip), %ymm9, %ymm10 +vpunpckhwd const0(%rip), %ymm9, %ymm9 +vmovdqa 576(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm10, %ymm7 +vpaddd %ymm6, %ymm9, %ymm8 +vpsubd %ymm4, %ymm7, %ymm7 +vpsubd %ymm3, %ymm8, %ymm8 +vpsubd %ymm5, %ymm10, %ymm5 +vpsubd %ymm6, %ymm9, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1600(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm9 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm10, %ymm10 +vpsubd %ymm9, %ymm7, %ymm7 +vpsubd %ymm10, %ymm8, %ymm8 +vpsrld $1, %ymm7, %ymm7 +vpsrld $1, %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpackusdw %ymm8, %ymm7, %ymm8 +vmovdqa 832(%rsp), %ymm7 +vpaddw 1088(%rsp), %ymm7, %ymm10 +vpsubw 1088(%rsp), %ymm7, %ymm7 +vpsrlw $2, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsllw $1, %ymm11, %ymm9 +vpsubw %ymm9, %ymm10, %ymm9 +vpsllw $7, %ymm5, %ymm10 +vpsubw %ymm10, %ymm9, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm8, %ymm10, %ymm10 +vmovdqa 1344(%rsp), %ymm9 +vpsubw %ymm11, %ymm9, %ymm9 +vpmullw %ymm15, %ymm5, %ymm3 +vpsubw %ymm3, %ymm9, %ymm3 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm10, %ymm8, %ymm8 +vpmullw %ymm12, %ymm10, %ymm9 +vpaddw %ymm9, %ymm8, %ymm9 +vpmullw %ymm12, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm7, %ymm9, %ymm9 +vpsubw %ymm9, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, %ymm6 +vpmullw %ymm13, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm3 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm10, %ymm10 +vpaddw %ymm10, %ymm11, %ymm11 +vmovdqa %xmm3, 2112(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_3_5(%rip), %ymm9, %ymm3 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $206, %ymm3, %ymm3 +vpand 
mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm9, %ymm9 +vpaddw %ymm9, %ymm6, %ymm6 +vmovdqa %xmm3, 2368(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm3 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm5, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vmovdqa %xmm3, 2624(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 176(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 528(%rdi) +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %ymm8, 880(%rdi) +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %ymm7, 1232(%rdi) +vmovdqa 96(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm9 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm10, %ymm10 +vmovdqa 352(%rsp), %ymm7 +vpunpcklwd const0(%rip), %ymm7, %ymm8 +vpunpckhwd const0(%rip), %ymm7, %ymm7 +vmovdqa 608(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm8, %ymm3 +vpaddd %ymm6, %ymm7, %ymm4 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm10, %ymm4, %ymm4 +vpsubd %ymm11, %ymm8, %ymm11 +vpsubd %ymm6, %ymm7, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1632(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm7 +vpunpckhwd const0(%rip), %ymm11, %ymm8 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm8, %ymm8 +vpsubd %ymm7, %ymm3, %ymm3 +vpsubd %ymm8, %ymm4, %ymm4 +vpsrld $1, %ymm3, %ymm3 +vpsrld $1, %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpackusdw %ymm4, %ymm3, %ymm4 +vmovdqa 864(%rsp), %ymm3 +vpaddw 1120(%rsp), %ymm3, %ymm8 +vpsubw 1120(%rsp), %ymm3, %ymm3 +vpsrlw $2, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsllw $1, %ymm5, %ymm7 +vpsubw %ymm7, %ymm8, %ymm7 +vpsllw $7, %ymm11, %ymm8 +vpsubw %ymm8, %ymm7, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm4, %ymm8, %ymm8 +vmovdqa 1376(%rsp), %ymm7 +vpsubw %ymm5, %ymm7, %ymm7 +vpmullw %ymm15, %ymm11, %ymm10 +vpsubw %ymm10, %ymm7, %ymm10 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm8, %ymm4, %ymm4 +vpmullw %ymm12, %ymm8, %ymm7 +vpaddw %ymm7, %ymm4, %ymm7 +vpmullw %ymm12, %ymm7, %ymm7 +vpsubw %ymm7, %ymm10, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm3, %ymm7, %ymm7 +vpsubw %ymm7, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vpmullw %ymm13, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm10 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm9 +vpor %ymm9, %ymm8, %ymm8 +vpaddw %ymm8, %ymm5, %ymm5 +vmovdqa %xmm10, 2144(%rsp) +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_3_5(%rip), %ymm7, %ymm10 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm9 +vpor %ymm9, %ymm7, %ymm7 +vpaddw %ymm7, %ymm6, %ymm6 +vmovdqa %xmm10, 2400(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm10 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm9 +vpor %ymm9, %ymm11, %ymm11 +vpaddw %ymm11, %ymm4, %ymm4 +vmovdqa %xmm10, 2656(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 264(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 616(%rdi) +vpand 
mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %ymm4, 968(%rdi) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 1320(%rdi) +vmovdqa 128(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm7 +vpunpckhwd const0(%rip), %ymm11, %ymm8 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm8, %ymm8 +vmovdqa 384(%rsp), %ymm3 +vpunpcklwd const0(%rip), %ymm3, %ymm4 +vpunpckhwd const0(%rip), %ymm3, %ymm3 +vmovdqa 640(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm4, %ymm10 +vpaddd %ymm6, %ymm3, %ymm9 +vpsubd %ymm7, %ymm10, %ymm10 +vpsubd %ymm8, %ymm9, %ymm9 +vpsubd %ymm5, %ymm4, %ymm5 +vpsubd %ymm6, %ymm3, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1664(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm3 +vpunpckhwd const0(%rip), %ymm5, %ymm4 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm4, %ymm4 +vpsubd %ymm3, %ymm10, %ymm10 +vpsubd %ymm4, %ymm9, %ymm9 +vpsrld $1, %ymm10, %ymm10 +vpsrld $1, %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpackusdw %ymm9, %ymm10, %ymm9 +vmovdqa 896(%rsp), %ymm10 +vpaddw 1152(%rsp), %ymm10, %ymm4 +vpsubw 1152(%rsp), %ymm10, %ymm10 +vpsrlw $2, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsllw $1, %ymm11, %ymm3 +vpsubw %ymm3, %ymm4, %ymm3 +vpsllw $7, %ymm5, %ymm4 +vpsubw %ymm4, %ymm3, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, %ymm4 +vmovdqa 1408(%rsp), %ymm3 +vpsubw %ymm11, %ymm3, %ymm3 +vpmullw %ymm15, %ymm5, %ymm8 +vpsubw %ymm8, %ymm3, %ymm8 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpmullw %ymm12, %ymm4, %ymm3 +vpaddw %ymm3, %ymm9, %ymm3 +vpmullw %ymm12, %ymm3, %ymm3 +vpsubw %ymm3, %ymm8, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm10, %ymm3, %ymm3 +vpsubw %ymm3, %ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vpmullw %ymm13, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vmovdqu 352(%rdi), %ymm8 +vmovdqu 704(%rdi), %ymm7 +vmovdqu 1056(%rdi), %ymm2 +vpaddw %ymm11, %ymm8, %ymm11 +vpaddw %ymm6, %ymm7, %ymm6 +vpaddw %ymm9, %ymm2, %ymm9 +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm2 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm7 +vpor %ymm7, %ymm10, %ymm10 +vmovdqu 0(%rdi), %ymm7 +vpaddw %ymm10, %ymm7, %ymm7 +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %ymm7, 0(%rdi) +vmovdqa %xmm2, 1920(%rsp) +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm2 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm7 +vpor %ymm7, %ymm4, %ymm4 +vpaddw %ymm4, %ymm11, %ymm11 +vmovdqa %xmm2, 2176(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_3_5(%rip), %ymm3, %ymm2 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm7 +vpor %ymm7, %ymm3, %ymm3 +vpaddw %ymm3, %ymm6, %ymm6 +vmovdqa %xmm2, 2432(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm2 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm7 +vpor %ymm7, %ymm5, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vmovdqa %xmm2, 2688(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 352(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 704(%rdi) +vpand mask_mod8192(%rip), 
%ymm9, %ymm9 +vmovdqu %ymm9, 1056(%rdi) +vmovdqa 160(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm3 +vpunpckhwd const0(%rip), %ymm5, %ymm4 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm4, %ymm4 +vmovdqa 416(%rsp), %ymm10 +vpunpcklwd const0(%rip), %ymm10, %ymm9 +vpunpckhwd const0(%rip), %ymm10, %ymm10 +vmovdqa 672(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm9, %ymm2 +vpaddd %ymm6, %ymm10, %ymm7 +vpsubd %ymm3, %ymm2, %ymm2 +vpsubd %ymm4, %ymm7, %ymm7 +vpsubd %ymm11, %ymm9, %ymm11 +vpsubd %ymm6, %ymm10, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1696(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm10 +vpunpckhwd const0(%rip), %ymm11, %ymm9 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm9, %ymm9 +vpsubd %ymm10, %ymm2, %ymm2 +vpsubd %ymm9, %ymm7, %ymm7 +vpsrld $1, %ymm2, %ymm2 +vpsrld $1, %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpackusdw %ymm7, %ymm2, %ymm7 +vmovdqa 928(%rsp), %ymm2 +vpaddw 1184(%rsp), %ymm2, %ymm9 +vpsubw 1184(%rsp), %ymm2, %ymm2 +vpsrlw $2, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsllw $1, %ymm5, %ymm10 +vpsubw %ymm10, %ymm9, %ymm10 +vpsllw $7, %ymm11, %ymm9 +vpsubw %ymm9, %ymm10, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm7, %ymm9, %ymm9 +vmovdqa 1440(%rsp), %ymm10 +vpsubw %ymm5, %ymm10, %ymm10 +vpmullw %ymm15, %ymm11, %ymm4 +vpsubw %ymm4, %ymm10, %ymm4 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm9, %ymm7, %ymm7 +vpmullw %ymm12, %ymm9, %ymm10 +vpaddw %ymm10, %ymm7, %ymm10 +vpmullw %ymm12, %ymm10, %ymm10 +vpsubw %ymm10, %ymm4, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm2, %ymm10, %ymm10 +vpsubw %ymm10, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vpmullw %ymm13, %ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vmovdqu 440(%rdi), %ymm4 +vmovdqu 792(%rdi), %ymm3 +vmovdqu 1144(%rdi), %ymm8 +vpaddw %ymm5, %ymm4, %ymm5 +vpaddw %ymm6, %ymm3, %ymm6 +vpaddw %ymm7, %ymm8, %ymm7 +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_3_5(%rip), %ymm2, %ymm8 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm2, %ymm2 +vmovdqu 88(%rdi), %ymm3 +vpaddw %ymm2, %ymm3, %ymm3 +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 88(%rdi) +vmovdqa %xmm8, 1952(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_3_5(%rip), %ymm9, %ymm8 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm9, %ymm9 +vpaddw %ymm9, %ymm5, %ymm5 +vmovdqa %xmm8, 2208(%rsp) +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm8 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm10, %ymm10 +vpaddw %ymm10, %ymm6, %ymm6 +vmovdqa %xmm8, 2464(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm8 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm11, %ymm11 +vpaddw %ymm11, %ymm7, %ymm7 +vmovdqa %xmm8, 2720(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 440(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 792(%rdi) +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %ymm7, 1144(%rdi) +vmovdqa 192(%rsp), 
%ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm10 +vpunpckhwd const0(%rip), %ymm11, %ymm9 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm9, %ymm9 +vmovdqa 448(%rsp), %ymm2 +vpunpcklwd const0(%rip), %ymm2, %ymm7 +vpunpckhwd const0(%rip), %ymm2, %ymm2 +vmovdqa 704(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm7, %ymm8 +vpaddd %ymm6, %ymm2, %ymm3 +vpsubd %ymm10, %ymm8, %ymm8 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm5, %ymm7, %ymm5 +vpsubd %ymm6, %ymm2, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1728(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm7 +vpslld $1, %ymm2, %ymm2 +vpslld $1, %ymm7, %ymm7 +vpsubd %ymm2, %ymm8, %ymm8 +vpsubd %ymm7, %ymm3, %ymm3 +vpsrld $1, %ymm8, %ymm8 +vpsrld $1, %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpackusdw %ymm3, %ymm8, %ymm3 +vmovdqa 960(%rsp), %ymm8 +vpaddw 1216(%rsp), %ymm8, %ymm7 +vpsubw 1216(%rsp), %ymm8, %ymm8 +vpsrlw $2, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsllw $1, %ymm11, %ymm2 +vpsubw %ymm2, %ymm7, %ymm2 +vpsllw $7, %ymm5, %ymm7 +vpsubw %ymm7, %ymm2, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm3, %ymm7, %ymm7 +vmovdqa 1472(%rsp), %ymm2 +vpsubw %ymm11, %ymm2, %ymm2 +vpmullw %ymm15, %ymm5, %ymm9 +vpsubw %ymm9, %ymm2, %ymm9 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm7, %ymm3, %ymm3 +vpmullw %ymm12, %ymm7, %ymm2 +vpaddw %ymm2, %ymm3, %ymm2 +vpmullw %ymm12, %ymm2, %ymm2 +vpsubw %ymm2, %ymm9, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vpmullw %ymm13, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vmovdqu 528(%rdi), %ymm9 +vmovdqu 880(%rdi), %ymm10 +vmovdqu 1232(%rdi), %ymm4 +vpaddw %ymm11, %ymm9, %ymm11 +vpaddw %ymm6, %ymm10, %ymm6 +vpaddw %ymm3, %ymm4, %ymm3 +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm4 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm10 +vpor %ymm10, %ymm8, %ymm8 +vmovdqu 176(%rdi), %ymm10 +vpaddw %ymm8, %ymm10, %ymm10 +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 176(%rdi) +vmovdqa %xmm4, 1984(%rsp) +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_3_5(%rip), %ymm7, %ymm4 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm10 +vpor %ymm10, %ymm7, %ymm7 +vpaddw %ymm7, %ymm11, %ymm11 +vmovdqa %xmm4, 2240(%rsp) +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_3_5(%rip), %ymm2, %ymm4 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm10 +vpor %ymm10, %ymm2, %ymm2 +vpaddw %ymm2, %ymm6, %ymm6 +vmovdqa %xmm4, 2496(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm4 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm10 +vpor %ymm10, %ymm5, %ymm5 +vpaddw %ymm5, %ymm3, %ymm3 +vmovdqa %xmm4, 2752(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 528(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 880(%rdi) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 1232(%rdi) +vmovdqa 224(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm7 +vpslld $1, 
%ymm2, %ymm2 +vpslld $1, %ymm7, %ymm7 +vmovdqa 480(%rsp), %ymm8 +vpunpcklwd const0(%rip), %ymm8, %ymm3 +vpunpckhwd const0(%rip), %ymm8, %ymm8 +vmovdqa 736(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm3, %ymm4 +vpaddd %ymm6, %ymm8, %ymm10 +vpsubd %ymm2, %ymm4, %ymm4 +vpsubd %ymm7, %ymm10, %ymm10 +vpsubd %ymm11, %ymm3, %ymm11 +vpsubd %ymm6, %ymm8, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1760(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, %ymm3 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm3, %ymm3 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm3, %ymm10, %ymm10 +vpsrld $1, %ymm4, %ymm4 +vpsrld $1, %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpackusdw %ymm10, %ymm4, %ymm10 +vmovdqa 992(%rsp), %ymm4 +vpaddw 1248(%rsp), %ymm4, %ymm3 +vpsubw 1248(%rsp), %ymm4, %ymm4 +vpsrlw $2, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsllw $1, %ymm5, %ymm8 +vpsubw %ymm8, %ymm3, %ymm8 +vpsllw $7, %ymm11, %ymm3 +vpsubw %ymm3, %ymm8, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm10, %ymm3, %ymm3 +vmovdqa 1504(%rsp), %ymm8 +vpsubw %ymm5, %ymm8, %ymm8 +vpmullw %ymm15, %ymm11, %ymm7 +vpsubw %ymm7, %ymm8, %ymm7 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm3, %ymm10, %ymm10 +vpmullw %ymm12, %ymm3, %ymm8 +vpaddw %ymm8, %ymm10, %ymm8 +vpmullw %ymm12, %ymm8, %ymm8 +vpsubw %ymm8, %ymm7, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm4, %ymm8, %ymm8 +vpsubw %ymm8, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpmullw %ymm13, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vmovdqu 616(%rdi), %ymm7 +vmovdqu 968(%rdi), %ymm2 +vmovdqu 1320(%rdi), %ymm9 +vpaddw %ymm5, %ymm7, %ymm5 +vpaddw %ymm6, %ymm2, %ymm6 +vpaddw %ymm10, %ymm9, %ymm10 +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm9 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm2 +vpor %ymm2, %ymm4, %ymm4 +vmovdqu 264(%rdi), %ymm2 +vpaddw %ymm4, %ymm2, %ymm2 +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %ymm2, 264(%rdi) +vmovdqa %xmm9, 2016(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_3_5(%rip), %ymm3, %ymm9 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm2 +vpor %ymm2, %ymm3, %ymm3 +vpaddw %ymm3, %ymm5, %ymm5 +vmovdqa %xmm9, 2272(%rsp) +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm9 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm2 +vpor %ymm2, %ymm8, %ymm8 +vpaddw %ymm8, %ymm6, %ymm6 +vmovdqa %xmm9, 2528(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm9 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm2 +vpor %ymm2, %ymm11, %ymm11 +vpaddw %ymm11, %ymm10, %ymm10 +vmovdqa %xmm9, 2784(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 616(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 968(%rdi) +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 1320(%rdi) +vmovdqa 128(%r12), %ymm0 +vpsubw 224(%r12), %ymm0, %ymm0 +vmovdqa 512(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 320(%r12), %ymm1, %ymm1 +vpsubw 32(%r12), %ymm0, %ymm0 +vpaddw 416(%r12), 
%ymm0, %ymm0 +vmovdqa 704(%r12), %ymm2 +vpsubw 800(%r12), %ymm2, %ymm2 +vmovdqa 1088(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 896(%r12), %ymm3, %ymm3 +vpsubw 608(%r12), %ymm2, %ymm2 +vpaddw 992(%r12), %ymm2, %ymm2 +vmovdqa 1280(%r12), %ymm4 +vpsubw 1376(%r12), %ymm4, %ymm4 +vmovdqa 1664(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 1472(%r12), %ymm5, %ymm5 +vpsubw 1184(%r12), %ymm4, %ymm4 +vpaddw 1568(%r12), %ymm4, %ymm4 +vpsubw 608(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 32(%r12), %ymm1, %ymm1 +vpaddw 1184(%r12), %ymm1, %ymm1 +vmovdqa 320(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 1472(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 896(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 32(%r12), %ymm8 +vmovdqa 896(%r12), %ymm9 +vmovdqa %ymm8, 0(%rsp) +vmovdqa %ymm0, 32(%rsp) +vmovdqa %ymm1, 64(%rsp) +vmovdqa %ymm7, 96(%rsp) +vmovdqa %ymm5, 128(%rsp) +vmovdqa %ymm2, 160(%rsp) +vmovdqa %ymm3, 192(%rsp) +vmovdqa %ymm9, 224(%rsp) +vmovdqa 1856(%r12), %ymm0 +vpsubw 1952(%r12), %ymm0, %ymm0 +vmovdqa 2240(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 2048(%r12), %ymm1, %ymm1 +vpsubw 1760(%r12), %ymm0, %ymm0 +vpaddw 2144(%r12), %ymm0, %ymm0 +vmovdqa 2432(%r12), %ymm2 +vpsubw 2528(%r12), %ymm2, %ymm2 +vmovdqa 2816(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 2624(%r12), %ymm3, %ymm3 +vpsubw 2336(%r12), %ymm2, %ymm2 +vpaddw 2720(%r12), %ymm2, %ymm2 +vmovdqa 3008(%r12), %ymm4 +vpsubw 3104(%r12), %ymm4, %ymm4 +vmovdqa 3392(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 3200(%r12), %ymm5, %ymm5 +vpsubw 2912(%r12), %ymm4, %ymm4 +vpaddw 3296(%r12), %ymm4, %ymm4 +vpsubw 2336(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 1760(%r12), %ymm1, %ymm1 +vpaddw 2912(%r12), %ymm1, %ymm1 +vmovdqa 2048(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 3200(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 2624(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 1760(%r12), %ymm8 +vmovdqa 2624(%r12), %ymm9 +vmovdqa %ymm8, 256(%rsp) +vmovdqa %ymm0, 288(%rsp) +vmovdqa %ymm1, 320(%rsp) +vmovdqa %ymm7, 352(%rsp) +vmovdqa %ymm5, 384(%rsp) +vmovdqa %ymm2, 416(%rsp) +vmovdqa %ymm3, 448(%rsp) +vmovdqa %ymm9, 480(%rsp) +vmovdqa 3584(%r12), %ymm0 +vpsubw 3680(%r12), %ymm0, %ymm0 +vmovdqa 3968(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3776(%r12), %ymm1, %ymm1 +vpsubw 3488(%r12), %ymm0, %ymm0 +vpaddw 3872(%r12), %ymm0, %ymm0 +vmovdqa 4160(%r12), %ymm2 +vpsubw 4256(%r12), %ymm2, %ymm2 +vmovdqa 4544(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 4352(%r12), %ymm3, %ymm3 +vpsubw 4064(%r12), %ymm2, %ymm2 +vpaddw 4448(%r12), %ymm2, %ymm2 +vmovdqa 4736(%r12), %ymm4 +vpsubw 4832(%r12), %ymm4, %ymm4 +vmovdqa 5120(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 4928(%r12), %ymm5, %ymm5 +vpsubw 4640(%r12), %ymm4, %ymm4 +vpaddw 5024(%r12), %ymm4, %ymm4 +vpsubw 4064(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 3488(%r12), %ymm1, %ymm1 +vpaddw 4640(%r12), %ymm1, %ymm1 +vmovdqa 3776(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 4928(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 4352(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 3488(%r12), %ymm8 +vmovdqa 4352(%r12), %ymm9 +vmovdqa %ymm8, 512(%rsp) +vmovdqa %ymm0, 544(%rsp) +vmovdqa %ymm1, 576(%rsp) +vmovdqa %ymm7, 608(%rsp) +vmovdqa %ymm5, 640(%rsp) +vmovdqa %ymm2, 672(%rsp) +vmovdqa %ymm3, 704(%rsp) +vmovdqa %ymm9, 736(%rsp) 
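+// Note: each vpsubw/vpaddw block above folds one group of 32-byte rows loaded from
+// the buffer addressed by %r12 into eight intermediate vectors and spills them to
+// consecutive 32-byte slots on the stack; the same pattern repeats below for the
+// remaining groups of rows before the masked stores back to (%rdi).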
+vmovdqa 5312(%r12), %ymm0 +vpsubw 5408(%r12), %ymm0, %ymm0 +vmovdqa 5696(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5504(%r12), %ymm1, %ymm1 +vpsubw 5216(%r12), %ymm0, %ymm0 +vpaddw 5600(%r12), %ymm0, %ymm0 +vmovdqa 5888(%r12), %ymm2 +vpsubw 5984(%r12), %ymm2, %ymm2 +vmovdqa 6272(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 6080(%r12), %ymm3, %ymm3 +vpsubw 5792(%r12), %ymm2, %ymm2 +vpaddw 6176(%r12), %ymm2, %ymm2 +vmovdqa 6464(%r12), %ymm4 +vpsubw 6560(%r12), %ymm4, %ymm4 +vmovdqa 6848(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 6656(%r12), %ymm5, %ymm5 +vpsubw 6368(%r12), %ymm4, %ymm4 +vpaddw 6752(%r12), %ymm4, %ymm4 +vpsubw 5792(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 5216(%r12), %ymm1, %ymm1 +vpaddw 6368(%r12), %ymm1, %ymm1 +vmovdqa 5504(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 6656(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 6080(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 5216(%r12), %ymm8 +vmovdqa 6080(%r12), %ymm9 +vmovdqa %ymm8, 768(%rsp) +vmovdqa %ymm0, 800(%rsp) +vmovdqa %ymm1, 832(%rsp) +vmovdqa %ymm7, 864(%rsp) +vmovdqa %ymm5, 896(%rsp) +vmovdqa %ymm2, 928(%rsp) +vmovdqa %ymm3, 960(%rsp) +vmovdqa %ymm9, 992(%rsp) +vmovdqa 7040(%r12), %ymm0 +vpsubw 7136(%r12), %ymm0, %ymm0 +vmovdqa 7424(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 7232(%r12), %ymm1, %ymm1 +vpsubw 6944(%r12), %ymm0, %ymm0 +vpaddw 7328(%r12), %ymm0, %ymm0 +vmovdqa 7616(%r12), %ymm2 +vpsubw 7712(%r12), %ymm2, %ymm2 +vmovdqa 8000(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 7808(%r12), %ymm3, %ymm3 +vpsubw 7520(%r12), %ymm2, %ymm2 +vpaddw 7904(%r12), %ymm2, %ymm2 +vmovdqa 8192(%r12), %ymm4 +vpsubw 8288(%r12), %ymm4, %ymm4 +vmovdqa 8576(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 8384(%r12), %ymm5, %ymm5 +vpsubw 8096(%r12), %ymm4, %ymm4 +vpaddw 8480(%r12), %ymm4, %ymm4 +vpsubw 7520(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 6944(%r12), %ymm1, %ymm1 +vpaddw 8096(%r12), %ymm1, %ymm1 +vmovdqa 7232(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 8384(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 7808(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 6944(%r12), %ymm8 +vmovdqa 7808(%r12), %ymm9 +vmovdqa %ymm8, 1024(%rsp) +vmovdqa %ymm0, 1056(%rsp) +vmovdqa %ymm1, 1088(%rsp) +vmovdqa %ymm7, 1120(%rsp) +vmovdqa %ymm5, 1152(%rsp) +vmovdqa %ymm2, 1184(%rsp) +vmovdqa %ymm3, 1216(%rsp) +vmovdqa %ymm9, 1248(%rsp) +vmovdqa 8768(%r12), %ymm0 +vpsubw 8864(%r12), %ymm0, %ymm0 +vmovdqa 9152(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 8960(%r12), %ymm1, %ymm1 +vpsubw 8672(%r12), %ymm0, %ymm0 +vpaddw 9056(%r12), %ymm0, %ymm0 +vmovdqa 9344(%r12), %ymm2 +vpsubw 9440(%r12), %ymm2, %ymm2 +vmovdqa 9728(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 9536(%r12), %ymm3, %ymm3 +vpsubw 9248(%r12), %ymm2, %ymm2 +vpaddw 9632(%r12), %ymm2, %ymm2 +vmovdqa 9920(%r12), %ymm4 +vpsubw 10016(%r12), %ymm4, %ymm4 +vmovdqa 10304(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 10112(%r12), %ymm5, %ymm5 +vpsubw 9824(%r12), %ymm4, %ymm4 +vpaddw 10208(%r12), %ymm4, %ymm4 +vpsubw 9248(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 8672(%r12), %ymm1, %ymm1 +vpaddw 9824(%r12), %ymm1, %ymm1 +vmovdqa 8960(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 10112(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 9536(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 8672(%r12), %ymm8 
+vmovdqa 9536(%r12), %ymm9 +vmovdqa %ymm8, 1280(%rsp) +vmovdqa %ymm0, 1312(%rsp) +vmovdqa %ymm1, 1344(%rsp) +vmovdqa %ymm7, 1376(%rsp) +vmovdqa %ymm5, 1408(%rsp) +vmovdqa %ymm2, 1440(%rsp) +vmovdqa %ymm3, 1472(%rsp) +vmovdqa %ymm9, 1504(%rsp) +vmovdqa 10496(%r12), %ymm0 +vpsubw 10592(%r12), %ymm0, %ymm0 +vmovdqa 10880(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 10688(%r12), %ymm1, %ymm1 +vpsubw 10400(%r12), %ymm0, %ymm0 +vpaddw 10784(%r12), %ymm0, %ymm0 +vmovdqa 11072(%r12), %ymm2 +vpsubw 11168(%r12), %ymm2, %ymm2 +vmovdqa 11456(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 11264(%r12), %ymm3, %ymm3 +vpsubw 10976(%r12), %ymm2, %ymm2 +vpaddw 11360(%r12), %ymm2, %ymm2 +vmovdqa 11648(%r12), %ymm4 +vpsubw 11744(%r12), %ymm4, %ymm4 +vmovdqa 12032(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 11840(%r12), %ymm5, %ymm5 +vpsubw 11552(%r12), %ymm4, %ymm4 +vpaddw 11936(%r12), %ymm4, %ymm4 +vpsubw 10976(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 10400(%r12), %ymm1, %ymm1 +vpaddw 11552(%r12), %ymm1, %ymm1 +vmovdqa 10688(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 11840(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 11264(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 10400(%r12), %ymm8 +vmovdqa 11264(%r12), %ymm9 +vmovdqa %ymm8, 1536(%rsp) +vmovdqa %ymm0, 1568(%rsp) +vmovdqa %ymm1, 1600(%rsp) +vmovdqa %ymm7, 1632(%rsp) +vmovdqa %ymm5, 1664(%rsp) +vmovdqa %ymm2, 1696(%rsp) +vmovdqa %ymm3, 1728(%rsp) +vmovdqa %ymm9, 1760(%rsp) +vmovdqa 0(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, %ymm3 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm3, %ymm3 +vmovdqa 256(%rsp), %ymm4 +vpunpcklwd const0(%rip), %ymm4, %ymm10 +vpunpckhwd const0(%rip), %ymm4, %ymm4 +vmovdqa 512(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm10, %ymm9 +vpaddd %ymm6, %ymm4, %ymm2 +vpsubd %ymm8, %ymm9, %ymm9 +vpsubd %ymm3, %ymm2, %ymm2 +vpsubd %ymm5, %ymm10, %ymm5 +vpsubd %ymm6, %ymm4, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1536(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm4 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm10, %ymm10 +vpsubd %ymm4, %ymm9, %ymm9 +vpsubd %ymm10, %ymm2, %ymm2 +vpsrld $1, %ymm9, %ymm9 +vpsrld $1, %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpackusdw %ymm2, %ymm9, %ymm2 +vmovdqa 768(%rsp), %ymm9 +vpaddw 1024(%rsp), %ymm9, %ymm10 +vpsubw 1024(%rsp), %ymm9, %ymm9 +vpsrlw $2, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsllw $1, %ymm11, %ymm4 +vpsubw %ymm4, %ymm10, %ymm4 +vpsllw $7, %ymm5, %ymm10 +vpsubw %ymm10, %ymm4, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm2, %ymm10, %ymm10 +vmovdqa 1280(%rsp), %ymm4 +vpsubw %ymm11, %ymm4, %ymm4 +vpmullw %ymm15, %ymm5, %ymm3 +vpsubw %ymm3, %ymm4, %ymm3 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm10, %ymm2, %ymm2 +vpmullw %ymm12, %ymm10, %ymm4 +vpaddw %ymm4, %ymm2, %ymm4 +vpmullw %ymm12, %ymm4, %ymm4 +vpsubw %ymm4, %ymm3, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, %ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpmullw %ymm13, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm3 +vpand 
mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm8 +vpor %ymm8, %ymm10, %ymm10 +vpaddw 2048(%rsp), %ymm11, %ymm11 +vpaddw %ymm10, %ymm11, %ymm11 +vmovdqa %xmm3, 2048(%rsp) +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm3 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm8 +vpor %ymm8, %ymm4, %ymm4 +vpaddw 2304(%rsp), %ymm6, %ymm6 +vpaddw %ymm4, %ymm6, %ymm6 +vmovdqa %xmm3, 2304(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm3 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm8 +vpor %ymm8, %ymm5, %ymm5 +vpaddw 2560(%rsp), %ymm2, %ymm2 +vpaddw %ymm5, %ymm2, %ymm2 +vmovdqa %xmm3, 2560(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 32(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 384(%rdi) +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %ymm2, 736(%rdi) +vpand mask_mod8192(%rip), %ymm9, %ymm9 +vmovdqu %ymm9, 1088(%rdi) +vmovdqa 32(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm4 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm10, %ymm10 +vmovdqa 288(%rsp), %ymm9 +vpunpcklwd const0(%rip), %ymm9, %ymm2 +vpunpckhwd const0(%rip), %ymm9, %ymm9 +vmovdqa 544(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm2, %ymm3 +vpaddd %ymm6, %ymm9, %ymm8 +vpsubd %ymm4, %ymm3, %ymm3 +vpsubd %ymm10, %ymm8, %ymm8 +vpsubd %ymm11, %ymm2, %ymm11 +vpsubd %ymm6, %ymm9, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1568(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm9 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm2, %ymm2 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm2, %ymm8, %ymm8 +vpsrld $1, %ymm3, %ymm3 +vpsrld $1, %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpackusdw %ymm8, %ymm3, %ymm8 +vmovdqa 800(%rsp), %ymm3 +vpaddw 1056(%rsp), %ymm3, %ymm2 +vpsubw 1056(%rsp), %ymm3, %ymm3 +vpsrlw $2, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsllw $1, %ymm5, %ymm9 +vpsubw %ymm9, %ymm2, %ymm9 +vpsllw $7, %ymm11, %ymm2 +vpsubw %ymm2, %ymm9, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vmovdqa 1312(%rsp), %ymm9 +vpsubw %ymm5, %ymm9, %ymm9 +vpmullw %ymm15, %ymm11, %ymm10 +vpsubw %ymm10, %ymm9, %ymm10 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpmullw %ymm12, %ymm2, %ymm9 +vpaddw %ymm9, %ymm8, %ymm9 +vpmullw %ymm12, %ymm9, %ymm9 +vpsubw %ymm9, %ymm10, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm3, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vpmullw %ymm13, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_3_5(%rip), %ymm2, %ymm10 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm4 +vpor %ymm4, %ymm2, %ymm2 +vpaddw 2080(%rsp), %ymm5, %ymm5 +vpaddw %ymm2, %ymm5, %ymm5 +vmovdqa %xmm10, 2080(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_3_5(%rip), %ymm9, %ymm10 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm4 +vpor %ymm4, %ymm9, %ymm9 +vpaddw 
2336(%rsp), %ymm6, %ymm6 +vpaddw %ymm9, %ymm6, %ymm6 +vmovdqa %xmm10, 2336(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm10 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm4 +vpor %ymm4, %ymm11, %ymm11 +vpaddw 2592(%rsp), %ymm8, %ymm8 +vpaddw %ymm11, %ymm8, %ymm8 +vmovdqa %xmm10, 2592(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 120(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 472(%rdi) +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %ymm8, 824(%rdi) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 1176(%rdi) +vmovdqa 64(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm9 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm2, %ymm2 +vmovdqa 320(%rsp), %ymm3 +vpunpcklwd const0(%rip), %ymm3, %ymm8 +vpunpckhwd const0(%rip), %ymm3, %ymm3 +vmovdqa 576(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm8, %ymm10 +vpaddd %ymm6, %ymm3, %ymm4 +vpsubd %ymm9, %ymm10, %ymm10 +vpsubd %ymm2, %ymm4, %ymm4 +vpsubd %ymm5, %ymm8, %ymm5 +vpsubd %ymm6, %ymm3, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1600(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm3 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm8, %ymm8 +vpsubd %ymm3, %ymm10, %ymm10 +vpsubd %ymm8, %ymm4, %ymm4 +vpsrld $1, %ymm10, %ymm10 +vpsrld $1, %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpackusdw %ymm4, %ymm10, %ymm4 +vmovdqa 832(%rsp), %ymm10 +vpaddw 1088(%rsp), %ymm10, %ymm8 +vpsubw 1088(%rsp), %ymm10, %ymm10 +vpsrlw $2, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsllw $1, %ymm11, %ymm3 +vpsubw %ymm3, %ymm8, %ymm3 +vpsllw $7, %ymm5, %ymm8 +vpsubw %ymm8, %ymm3, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm4, %ymm8, %ymm8 +vmovdqa 1344(%rsp), %ymm3 +vpsubw %ymm11, %ymm3, %ymm3 +vpmullw %ymm15, %ymm5, %ymm2 +vpsubw %ymm2, %ymm3, %ymm2 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm8, %ymm4, %ymm4 +vpmullw %ymm12, %ymm8, %ymm3 +vpaddw %ymm3, %ymm4, %ymm3 +vpmullw %ymm12, %ymm3, %ymm3 +vpsubw %ymm3, %ymm2, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm10, %ymm3, %ymm3 +vpsubw %ymm3, %ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vpmullw %ymm13, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm2 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm9 +vpor %ymm9, %ymm8, %ymm8 +vpaddw 2112(%rsp), %ymm11, %ymm11 +vpaddw %ymm8, %ymm11, %ymm11 +vmovdqa %xmm2, 2112(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_3_5(%rip), %ymm3, %ymm2 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm9 +vpor %ymm9, %ymm3, %ymm3 +vpaddw 2368(%rsp), %ymm6, %ymm6 +vpaddw %ymm3, %ymm6, %ymm6 +vmovdqa %xmm2, 2368(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm2 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm9 +vpor %ymm9, %ymm5, %ymm5 +vpaddw 2624(%rsp), %ymm4, %ymm4 +vpaddw %ymm5, %ymm4, %ymm4 +vmovdqa %xmm2, 2624(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 
208(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 560(%rdi) +vpand mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %ymm4, 912(%rdi) +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 1264(%rdi) +vmovdqa 96(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm3 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm8, %ymm8 +vmovdqa 352(%rsp), %ymm10 +vpunpcklwd const0(%rip), %ymm10, %ymm4 +vpunpckhwd const0(%rip), %ymm10, %ymm10 +vmovdqa 608(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm4, %ymm2 +vpaddd %ymm6, %ymm10, %ymm9 +vpsubd %ymm3, %ymm2, %ymm2 +vpsubd %ymm8, %ymm9, %ymm9 +vpsubd %ymm11, %ymm4, %ymm11 +vpsubd %ymm6, %ymm10, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1632(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm10 +vpunpckhwd const0(%rip), %ymm11, %ymm4 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm4, %ymm4 +vpsubd %ymm10, %ymm2, %ymm2 +vpsubd %ymm4, %ymm9, %ymm9 +vpsrld $1, %ymm2, %ymm2 +vpsrld $1, %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpackusdw %ymm9, %ymm2, %ymm9 +vmovdqa 864(%rsp), %ymm2 +vpaddw 1120(%rsp), %ymm2, %ymm4 +vpsubw 1120(%rsp), %ymm2, %ymm2 +vpsrlw $2, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsllw $1, %ymm5, %ymm10 +vpsubw %ymm10, %ymm4, %ymm10 +vpsllw $7, %ymm11, %ymm4 +vpsubw %ymm4, %ymm10, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, %ymm4 +vmovdqa 1376(%rsp), %ymm10 +vpsubw %ymm5, %ymm10, %ymm10 +vpmullw %ymm15, %ymm11, %ymm8 +vpsubw %ymm8, %ymm10, %ymm8 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpmullw %ymm12, %ymm4, %ymm10 +vpaddw %ymm10, %ymm9, %ymm10 +vpmullw %ymm12, %ymm10, %ymm10 +vpsubw %ymm10, %ymm8, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm2, %ymm10, %ymm10 +vpsubw %ymm10, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vpmullw %ymm13, %ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm8 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm4, %ymm4 +vpaddw 2144(%rsp), %ymm5, %ymm5 +vpaddw %ymm4, %ymm5, %ymm5 +vmovdqa %xmm8, 2144(%rsp) +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm8 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm10, %ymm10 +vpaddw 2400(%rsp), %ymm6, %ymm6 +vpaddw %ymm10, %ymm6, %ymm6 +vmovdqa %xmm8, 2400(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm8 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm11, %ymm11 +vpaddw 2656(%rsp), %ymm9, %ymm9 +vpaddw %ymm11, %ymm9, %ymm9 +vmovdqa %xmm8, 2656(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 296(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 648(%rdi) +vpand mask_mod8192(%rip), %ymm9, %ymm9 +vmovdqu %ymm9, 1000(%rdi) +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %ymm2, 1352(%rdi) +vmovdqa 128(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm10 +vpunpckhwd const0(%rip), %ymm11, %ymm4 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm4, %ymm4 +vmovdqa 384(%rsp), %ymm2 +vpunpcklwd 
const0(%rip), %ymm2, %ymm9 +vpunpckhwd const0(%rip), %ymm2, %ymm2 +vmovdqa 640(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm9, %ymm8 +vpaddd %ymm6, %ymm2, %ymm3 +vpsubd %ymm10, %ymm8, %ymm8 +vpsubd %ymm4, %ymm3, %ymm3 +vpsubd %ymm5, %ymm9, %ymm5 +vpsubd %ymm6, %ymm2, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1664(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm9 +vpslld $1, %ymm2, %ymm2 +vpslld $1, %ymm9, %ymm9 +vpsubd %ymm2, %ymm8, %ymm8 +vpsubd %ymm9, %ymm3, %ymm3 +vpsrld $1, %ymm8, %ymm8 +vpsrld $1, %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpackusdw %ymm3, %ymm8, %ymm3 +vmovdqa 896(%rsp), %ymm8 +vpaddw 1152(%rsp), %ymm8, %ymm9 +vpsubw 1152(%rsp), %ymm8, %ymm8 +vpsrlw $2, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsllw $1, %ymm11, %ymm2 +vpsubw %ymm2, %ymm9, %ymm2 +vpsllw $7, %ymm5, %ymm9 +vpsubw %ymm9, %ymm2, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm3, %ymm9, %ymm9 +vmovdqa 1408(%rsp), %ymm2 +vpsubw %ymm11, %ymm2, %ymm2 +vpmullw %ymm15, %ymm5, %ymm4 +vpsubw %ymm4, %ymm2, %ymm4 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, %ymm3 +vpmullw %ymm12, %ymm9, %ymm2 +vpaddw %ymm2, %ymm3, %ymm2 +vpmullw %ymm12, %ymm2, %ymm2 +vpsubw %ymm2, %ymm4, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vpmullw %ymm13, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vmovdqu 384(%rdi), %ymm4 +vmovdqu 736(%rdi), %ymm10 +vmovdqu 1088(%rdi), %ymm7 +vpaddw %ymm11, %ymm4, %ymm11 +vpaddw %ymm6, %ymm10, %ymm6 +vpaddw %ymm3, %ymm7, %ymm3 +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm7 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm10 +vpor %ymm10, %ymm8, %ymm8 +vmovdqu 32(%rdi), %ymm10 +vpaddw 1920(%rsp), %ymm10, %ymm10 +vpaddw %ymm8, %ymm10, %ymm10 +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 32(%rdi) +vmovdqa %xmm7, 1920(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_3_5(%rip), %ymm9, %ymm7 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm10 +vpor %ymm10, %ymm9, %ymm9 +vpaddw 2176(%rsp), %ymm11, %ymm11 +vpaddw %ymm9, %ymm11, %ymm11 +vmovdqa %xmm7, 2176(%rsp) +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_3_5(%rip), %ymm2, %ymm7 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm10 +vpor %ymm10, %ymm2, %ymm2 +vpaddw 2432(%rsp), %ymm6, %ymm6 +vpaddw %ymm2, %ymm6, %ymm6 +vmovdqa %xmm7, 2432(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm7 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm10 +vpor %ymm10, %ymm5, %ymm5 +vpaddw 2688(%rsp), %ymm3, %ymm3 +vpaddw %ymm5, %ymm3, %ymm3 +vmovdqa %xmm7, 2688(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 384(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 736(%rdi) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 1088(%rdi) +vmovdqa 160(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm9 +vpslld $1, %ymm2, %ymm2 +vpslld $1, %ymm9, %ymm9 
+vmovdqa 416(%rsp), %ymm8 +vpunpcklwd const0(%rip), %ymm8, %ymm3 +vpunpckhwd const0(%rip), %ymm8, %ymm8 +vmovdqa 672(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm3, %ymm7 +vpaddd %ymm6, %ymm8, %ymm10 +vpsubd %ymm2, %ymm7, %ymm7 +vpsubd %ymm9, %ymm10, %ymm10 +vpsubd %ymm11, %ymm3, %ymm11 +vpsubd %ymm6, %ymm8, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1696(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, %ymm3 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm3, %ymm3 +vpsubd %ymm8, %ymm7, %ymm7 +vpsubd %ymm3, %ymm10, %ymm10 +vpsrld $1, %ymm7, %ymm7 +vpsrld $1, %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpackusdw %ymm10, %ymm7, %ymm10 +vmovdqa 928(%rsp), %ymm7 +vpaddw 1184(%rsp), %ymm7, %ymm3 +vpsubw 1184(%rsp), %ymm7, %ymm7 +vpsrlw $2, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsllw $1, %ymm5, %ymm8 +vpsubw %ymm8, %ymm3, %ymm8 +vpsllw $7, %ymm11, %ymm3 +vpsubw %ymm3, %ymm8, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm10, %ymm3, %ymm3 +vmovdqa 1440(%rsp), %ymm8 +vpsubw %ymm5, %ymm8, %ymm8 +vpmullw %ymm15, %ymm11, %ymm9 +vpsubw %ymm9, %ymm8, %ymm9 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm3, %ymm10, %ymm10 +vpmullw %ymm12, %ymm3, %ymm8 +vpaddw %ymm8, %ymm10, %ymm8 +vpmullw %ymm12, %ymm8, %ymm8 +vpsubw %ymm8, %ymm9, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm7, %ymm8, %ymm8 +vpsubw %ymm8, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, %ymm6 +vpmullw %ymm13, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vmovdqu 472(%rdi), %ymm9 +vmovdqu 824(%rdi), %ymm2 +vmovdqu 1176(%rdi), %ymm4 +vpaddw %ymm5, %ymm9, %ymm5 +vpaddw %ymm6, %ymm2, %ymm6 +vpaddw %ymm10, %ymm4, %ymm10 +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_3_5(%rip), %ymm7, %ymm4 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm7, %ymm7 +vmovdqu 120(%rdi), %ymm2 +vpaddw 1952(%rsp), %ymm2, %ymm2 +vpaddw %ymm7, %ymm2, %ymm2 +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %ymm2, 120(%rdi) +vmovdqa %xmm4, 1952(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_3_5(%rip), %ymm3, %ymm4 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm3, %ymm3 +vpaddw 2208(%rsp), %ymm5, %ymm5 +vpaddw %ymm3, %ymm5, %ymm5 +vmovdqa %xmm4, 2208(%rsp) +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm4 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm8, %ymm8 +vpaddw 2464(%rsp), %ymm6, %ymm6 +vpaddw %ymm8, %ymm6, %ymm6 +vmovdqa %xmm4, 2464(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm4 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm11, %ymm11 +vpaddw 2720(%rsp), %ymm10, %ymm10 +vpaddw %ymm11, %ymm10, %ymm10 +vmovdqa %xmm4, 2720(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 472(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 824(%rdi) +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 1176(%rdi) +vmovdqa 192(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, 
%ymm3 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm3, %ymm3 +vmovdqa 448(%rsp), %ymm7 +vpunpcklwd const0(%rip), %ymm7, %ymm10 +vpunpckhwd const0(%rip), %ymm7, %ymm7 +vmovdqa 704(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm10, %ymm4 +vpaddd %ymm6, %ymm7, %ymm2 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm3, %ymm2, %ymm2 +vpsubd %ymm5, %ymm10, %ymm5 +vpsubd %ymm6, %ymm7, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1728(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm7 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm10, %ymm10 +vpsubd %ymm7, %ymm4, %ymm4 +vpsubd %ymm10, %ymm2, %ymm2 +vpsrld $1, %ymm4, %ymm4 +vpsrld $1, %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpackusdw %ymm2, %ymm4, %ymm2 +vmovdqa 960(%rsp), %ymm4 +vpaddw 1216(%rsp), %ymm4, %ymm10 +vpsubw 1216(%rsp), %ymm4, %ymm4 +vpsrlw $2, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsllw $1, %ymm11, %ymm7 +vpsubw %ymm7, %ymm10, %ymm7 +vpsllw $7, %ymm5, %ymm10 +vpsubw %ymm10, %ymm7, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm2, %ymm10, %ymm10 +vmovdqa 1472(%rsp), %ymm7 +vpsubw %ymm11, %ymm7, %ymm7 +vpmullw %ymm15, %ymm5, %ymm3 +vpsubw %ymm3, %ymm7, %ymm3 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm10, %ymm2, %ymm2 +vpmullw %ymm12, %ymm10, %ymm7 +vpaddw %ymm7, %ymm2, %ymm7 +vpmullw %ymm12, %ymm7, %ymm7 +vpsubw %ymm7, %ymm3, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm4, %ymm7, %ymm7 +vpsubw %ymm7, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpmullw %ymm13, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, %ymm6 +vmovdqu 560(%rdi), %ymm3 +vmovdqu 912(%rdi), %ymm8 +vmovdqu 1264(%rdi), %ymm9 +vpaddw %ymm11, %ymm3, %ymm11 +vpaddw %ymm6, %ymm8, %ymm6 +vpaddw %ymm2, %ymm9, %ymm2 +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm9 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm8 +vpor %ymm8, %ymm4, %ymm4 +vmovdqu 208(%rdi), %ymm8 +vpaddw 1984(%rsp), %ymm8, %ymm8 +vpaddw %ymm4, %ymm8, %ymm8 +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %ymm8, 208(%rdi) +vmovdqa %xmm9, 1984(%rsp) +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm9 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm8 +vpor %ymm8, %ymm10, %ymm10 +vpaddw 2240(%rsp), %ymm11, %ymm11 +vpaddw %ymm10, %ymm11, %ymm11 +vmovdqa %xmm9, 2240(%rsp) +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_3_5(%rip), %ymm7, %ymm9 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm8 +vpor %ymm8, %ymm7, %ymm7 +vpaddw 2496(%rsp), %ymm6, %ymm6 +vpaddw %ymm7, %ymm6, %ymm6 +vmovdqa %xmm9, 2496(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm9 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm8 +vpor %ymm8, %ymm5, %ymm5 +vpaddw 2752(%rsp), %ymm2, %ymm2 +vpaddw %ymm5, %ymm2, %ymm2 +vmovdqa %xmm9, 2752(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 560(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 912(%rdi) +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %ymm2, 1264(%rdi) +vmovdqa 224(%rsp), %ymm5 +vpunpcklwd const0(%rip), 
%ymm5, %ymm7 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm10, %ymm10 +vmovdqa 480(%rsp), %ymm4 +vpunpcklwd const0(%rip), %ymm4, %ymm2 +vpunpckhwd const0(%rip), %ymm4, %ymm4 +vmovdqa 736(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm2, %ymm9 +vpaddd %ymm6, %ymm4, %ymm8 +vpsubd %ymm7, %ymm9, %ymm9 +vpsubd %ymm10, %ymm8, %ymm8 +vpsubd %ymm11, %ymm2, %ymm11 +vpsubd %ymm6, %ymm4, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1760(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm2, %ymm2 +vpsubd %ymm4, %ymm9, %ymm9 +vpsubd %ymm2, %ymm8, %ymm8 +vpsrld $1, %ymm9, %ymm9 +vpsrld $1, %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpackusdw %ymm8, %ymm9, %ymm8 +vmovdqa 992(%rsp), %ymm9 +vpaddw 1248(%rsp), %ymm9, %ymm2 +vpsubw 1248(%rsp), %ymm9, %ymm9 +vpsrlw $2, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsllw $1, %ymm5, %ymm4 +vpsubw %ymm4, %ymm2, %ymm4 +vpsllw $7, %ymm11, %ymm2 +vpsubw %ymm2, %ymm4, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vmovdqa 1504(%rsp), %ymm4 +vpsubw %ymm5, %ymm4, %ymm4 +vpmullw %ymm15, %ymm11, %ymm10 +vpsubw %ymm10, %ymm4, %ymm10 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpmullw %ymm12, %ymm2, %ymm4 +vpaddw %ymm4, %ymm8, %ymm4 +vpmullw %ymm12, %ymm4, %ymm4 +vpsubw %ymm4, %ymm10, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, %ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpmullw %ymm13, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vmovdqu 648(%rdi), %ymm10 +vmovdqu 1000(%rdi), %ymm7 +vmovdqu 1352(%rdi), %ymm3 +vpaddw %ymm5, %ymm10, %ymm5 +vpaddw %ymm6, %ymm7, %ymm6 +vpaddw %ymm8, %ymm3, %ymm8 +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_3_5(%rip), %ymm9, %ymm3 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm7 +vpor %ymm7, %ymm9, %ymm9 +vmovdqu 296(%rdi), %ymm7 +vpaddw 2016(%rsp), %ymm7, %ymm7 +vpaddw %ymm9, %ymm7, %ymm7 +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %ymm7, 296(%rdi) +vmovdqa %xmm3, 2016(%rsp) +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_3_5(%rip), %ymm2, %ymm3 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm7 +vpor %ymm7, %ymm2, %ymm2 +vpaddw 2272(%rsp), %ymm5, %ymm5 +vpaddw %ymm2, %ymm5, %ymm5 +vmovdqa %xmm3, 2272(%rsp) +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm3 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm7 +vpor %ymm7, %ymm4, %ymm4 +vpaddw 2528(%rsp), %ymm6, %ymm6 +vpaddw %ymm4, %ymm6, %ymm6 +vmovdqa %xmm3, 2528(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm3 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm7 +vpor %ymm7, %ymm11, %ymm11 +vpaddw 2784(%rsp), %ymm8, %ymm8 +vpaddw %ymm11, %ymm8, %ymm8 +vmovdqa %xmm3, 2784(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 648(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 1000(%rdi) +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %ymm8, 1352(%rdi) +vmovdqa 
160(%r12), %ymm0 +vpsubw 256(%r12), %ymm0, %ymm0 +vmovdqa 544(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 352(%r12), %ymm1, %ymm1 +vpsubw 64(%r12), %ymm0, %ymm0 +vpaddw 448(%r12), %ymm0, %ymm0 +vmovdqa 736(%r12), %ymm2 +vpsubw 832(%r12), %ymm2, %ymm2 +vmovdqa 1120(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 928(%r12), %ymm3, %ymm3 +vpsubw 640(%r12), %ymm2, %ymm2 +vpaddw 1024(%r12), %ymm2, %ymm2 +vmovdqa 1312(%r12), %ymm4 +vpsubw 1408(%r12), %ymm4, %ymm4 +vmovdqa 1696(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 1504(%r12), %ymm5, %ymm5 +vpsubw 1216(%r12), %ymm4, %ymm4 +vpaddw 1600(%r12), %ymm4, %ymm4 +vpsubw 640(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 64(%r12), %ymm1, %ymm1 +vpaddw 1216(%r12), %ymm1, %ymm1 +vmovdqa 352(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 1504(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 928(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 64(%r12), %ymm8 +vmovdqa 928(%r12), %ymm9 +vmovdqa %ymm8, 0(%rsp) +vmovdqa %ymm0, 32(%rsp) +vmovdqa %ymm1, 64(%rsp) +vmovdqa %ymm7, 96(%rsp) +vmovdqa %ymm5, 128(%rsp) +vmovdqa %ymm2, 160(%rsp) +vmovdqa %ymm3, 192(%rsp) +vmovdqa %ymm9, 224(%rsp) +vmovdqa 1888(%r12), %ymm0 +vpsubw 1984(%r12), %ymm0, %ymm0 +vmovdqa 2272(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 2080(%r12), %ymm1, %ymm1 +vpsubw 1792(%r12), %ymm0, %ymm0 +vpaddw 2176(%r12), %ymm0, %ymm0 +vmovdqa 2464(%r12), %ymm2 +vpsubw 2560(%r12), %ymm2, %ymm2 +vmovdqa 2848(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 2656(%r12), %ymm3, %ymm3 +vpsubw 2368(%r12), %ymm2, %ymm2 +vpaddw 2752(%r12), %ymm2, %ymm2 +vmovdqa 3040(%r12), %ymm4 +vpsubw 3136(%r12), %ymm4, %ymm4 +vmovdqa 3424(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 3232(%r12), %ymm5, %ymm5 +vpsubw 2944(%r12), %ymm4, %ymm4 +vpaddw 3328(%r12), %ymm4, %ymm4 +vpsubw 2368(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 1792(%r12), %ymm1, %ymm1 +vpaddw 2944(%r12), %ymm1, %ymm1 +vmovdqa 2080(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 3232(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 2656(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 1792(%r12), %ymm8 +vmovdqa 2656(%r12), %ymm9 +vmovdqa %ymm8, 256(%rsp) +vmovdqa %ymm0, 288(%rsp) +vmovdqa %ymm1, 320(%rsp) +vmovdqa %ymm7, 352(%rsp) +vmovdqa %ymm5, 384(%rsp) +vmovdqa %ymm2, 416(%rsp) +vmovdqa %ymm3, 448(%rsp) +vmovdqa %ymm9, 480(%rsp) +vmovdqa 3616(%r12), %ymm0 +vpsubw 3712(%r12), %ymm0, %ymm0 +vmovdqa 4000(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3808(%r12), %ymm1, %ymm1 +vpsubw 3520(%r12), %ymm0, %ymm0 +vpaddw 3904(%r12), %ymm0, %ymm0 +vmovdqa 4192(%r12), %ymm2 +vpsubw 4288(%r12), %ymm2, %ymm2 +vmovdqa 4576(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 4384(%r12), %ymm3, %ymm3 +vpsubw 4096(%r12), %ymm2, %ymm2 +vpaddw 4480(%r12), %ymm2, %ymm2 +vmovdqa 4768(%r12), %ymm4 +vpsubw 4864(%r12), %ymm4, %ymm4 +vmovdqa 5152(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 4960(%r12), %ymm5, %ymm5 +vpsubw 4672(%r12), %ymm4, %ymm4 +vpaddw 5056(%r12), %ymm4, %ymm4 +vpsubw 4096(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 3520(%r12), %ymm1, %ymm1 +vpaddw 4672(%r12), %ymm1, %ymm1 +vmovdqa 3808(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 4960(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 4384(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 3520(%r12), %ymm8 +vmovdqa 4384(%r12), %ymm9 +vmovdqa %ymm8, 
512(%rsp) +vmovdqa %ymm0, 544(%rsp) +vmovdqa %ymm1, 576(%rsp) +vmovdqa %ymm7, 608(%rsp) +vmovdqa %ymm5, 640(%rsp) +vmovdqa %ymm2, 672(%rsp) +vmovdqa %ymm3, 704(%rsp) +vmovdqa %ymm9, 736(%rsp) +vmovdqa 5344(%r12), %ymm0 +vpsubw 5440(%r12), %ymm0, %ymm0 +vmovdqa 5728(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5536(%r12), %ymm1, %ymm1 +vpsubw 5248(%r12), %ymm0, %ymm0 +vpaddw 5632(%r12), %ymm0, %ymm0 +vmovdqa 5920(%r12), %ymm2 +vpsubw 6016(%r12), %ymm2, %ymm2 +vmovdqa 6304(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 6112(%r12), %ymm3, %ymm3 +vpsubw 5824(%r12), %ymm2, %ymm2 +vpaddw 6208(%r12), %ymm2, %ymm2 +vmovdqa 6496(%r12), %ymm4 +vpsubw 6592(%r12), %ymm4, %ymm4 +vmovdqa 6880(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 6688(%r12), %ymm5, %ymm5 +vpsubw 6400(%r12), %ymm4, %ymm4 +vpaddw 6784(%r12), %ymm4, %ymm4 +vpsubw 5824(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 5248(%r12), %ymm1, %ymm1 +vpaddw 6400(%r12), %ymm1, %ymm1 +vmovdqa 5536(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 6688(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 6112(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 5248(%r12), %ymm8 +vmovdqa 6112(%r12), %ymm9 +vmovdqa %ymm8, 768(%rsp) +vmovdqa %ymm0, 800(%rsp) +vmovdqa %ymm1, 832(%rsp) +vmovdqa %ymm7, 864(%rsp) +vmovdqa %ymm5, 896(%rsp) +vmovdqa %ymm2, 928(%rsp) +vmovdqa %ymm3, 960(%rsp) +vmovdqa %ymm9, 992(%rsp) +vmovdqa 7072(%r12), %ymm0 +vpsubw 7168(%r12), %ymm0, %ymm0 +vmovdqa 7456(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 7264(%r12), %ymm1, %ymm1 +vpsubw 6976(%r12), %ymm0, %ymm0 +vpaddw 7360(%r12), %ymm0, %ymm0 +vmovdqa 7648(%r12), %ymm2 +vpsubw 7744(%r12), %ymm2, %ymm2 +vmovdqa 8032(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 7840(%r12), %ymm3, %ymm3 +vpsubw 7552(%r12), %ymm2, %ymm2 +vpaddw 7936(%r12), %ymm2, %ymm2 +vmovdqa 8224(%r12), %ymm4 +vpsubw 8320(%r12), %ymm4, %ymm4 +vmovdqa 8608(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 8416(%r12), %ymm5, %ymm5 +vpsubw 8128(%r12), %ymm4, %ymm4 +vpaddw 8512(%r12), %ymm4, %ymm4 +vpsubw 7552(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 6976(%r12), %ymm1, %ymm1 +vpaddw 8128(%r12), %ymm1, %ymm1 +vmovdqa 7264(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 8416(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 7840(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 6976(%r12), %ymm8 +vmovdqa 7840(%r12), %ymm9 +vmovdqa %ymm8, 1024(%rsp) +vmovdqa %ymm0, 1056(%rsp) +vmovdqa %ymm1, 1088(%rsp) +vmovdqa %ymm7, 1120(%rsp) +vmovdqa %ymm5, 1152(%rsp) +vmovdqa %ymm2, 1184(%rsp) +vmovdqa %ymm3, 1216(%rsp) +vmovdqa %ymm9, 1248(%rsp) +vmovdqa 8800(%r12), %ymm0 +vpsubw 8896(%r12), %ymm0, %ymm0 +vmovdqa 9184(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 8992(%r12), %ymm1, %ymm1 +vpsubw 8704(%r12), %ymm0, %ymm0 +vpaddw 9088(%r12), %ymm0, %ymm0 +vmovdqa 9376(%r12), %ymm2 +vpsubw 9472(%r12), %ymm2, %ymm2 +vmovdqa 9760(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 9568(%r12), %ymm3, %ymm3 +vpsubw 9280(%r12), %ymm2, %ymm2 +vpaddw 9664(%r12), %ymm2, %ymm2 +vmovdqa 9952(%r12), %ymm4 +vpsubw 10048(%r12), %ymm4, %ymm4 +vmovdqa 10336(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 10144(%r12), %ymm5, %ymm5 +vpsubw 9856(%r12), %ymm4, %ymm4 +vpaddw 10240(%r12), %ymm4, %ymm4 +vpsubw 9280(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 8704(%r12), %ymm1, %ymm1 +vpaddw 9856(%r12), %ymm1, %ymm1 +vmovdqa 8992(%r12), %ymm6 +vpsubw 
%ymm2, %ymm6, %ymm7 +vmovdqa 10144(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 9568(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 8704(%r12), %ymm8 +vmovdqa 9568(%r12), %ymm9 +vmovdqa %ymm8, 1280(%rsp) +vmovdqa %ymm0, 1312(%rsp) +vmovdqa %ymm1, 1344(%rsp) +vmovdqa %ymm7, 1376(%rsp) +vmovdqa %ymm5, 1408(%rsp) +vmovdqa %ymm2, 1440(%rsp) +vmovdqa %ymm3, 1472(%rsp) +vmovdqa %ymm9, 1504(%rsp) +vmovdqa 10528(%r12), %ymm0 +vpsubw 10624(%r12), %ymm0, %ymm0 +vmovdqa 10912(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 10720(%r12), %ymm1, %ymm1 +vpsubw 10432(%r12), %ymm0, %ymm0 +vpaddw 10816(%r12), %ymm0, %ymm0 +vmovdqa 11104(%r12), %ymm2 +vpsubw 11200(%r12), %ymm2, %ymm2 +vmovdqa 11488(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 11296(%r12), %ymm3, %ymm3 +vpsubw 11008(%r12), %ymm2, %ymm2 +vpaddw 11392(%r12), %ymm2, %ymm2 +vmovdqa 11680(%r12), %ymm4 +vpsubw 11776(%r12), %ymm4, %ymm4 +vmovdqa 12064(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 11872(%r12), %ymm5, %ymm5 +vpsubw 11584(%r12), %ymm4, %ymm4 +vpaddw 11968(%r12), %ymm4, %ymm4 +vpsubw 11008(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 10432(%r12), %ymm1, %ymm1 +vpaddw 11584(%r12), %ymm1, %ymm1 +vmovdqa 10720(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 11872(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 11296(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 10432(%r12), %ymm8 +vmovdqa 11296(%r12), %ymm9 +vmovdqa %ymm8, 1536(%rsp) +vmovdqa %ymm0, 1568(%rsp) +vmovdqa %ymm1, 1600(%rsp) +vmovdqa %ymm7, 1632(%rsp) +vmovdqa %ymm5, 1664(%rsp) +vmovdqa %ymm2, 1696(%rsp) +vmovdqa %ymm3, 1728(%rsp) +vmovdqa %ymm9, 1760(%rsp) +vmovdqa 0(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm2, %ymm2 +vmovdqa 256(%rsp), %ymm9 +vpunpcklwd const0(%rip), %ymm9, %ymm8 +vpunpckhwd const0(%rip), %ymm9, %ymm9 +vmovdqa 512(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm8, %ymm3 +vpaddd %ymm6, %ymm9, %ymm7 +vpsubd %ymm4, %ymm3, %ymm3 +vpsubd %ymm2, %ymm7, %ymm7 +vpsubd %ymm5, %ymm8, %ymm5 +vpsubd %ymm6, %ymm9, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1536(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm9 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm8, %ymm8 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm8, %ymm7, %ymm7 +vpsrld $1, %ymm3, %ymm3 +vpsrld $1, %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpackusdw %ymm7, %ymm3, %ymm7 +vmovdqa 768(%rsp), %ymm3 +vpaddw 1024(%rsp), %ymm3, %ymm8 +vpsubw 1024(%rsp), %ymm3, %ymm3 +vpsrlw $2, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsllw $1, %ymm11, %ymm9 +vpsubw %ymm9, %ymm8, %ymm9 +vpsllw $7, %ymm5, %ymm8 +vpsubw %ymm8, %ymm9, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm7, %ymm8, %ymm8 +vmovdqa 1280(%rsp), %ymm9 +vpsubw %ymm11, %ymm9, %ymm9 +vpmullw %ymm15, %ymm5, %ymm2 +vpsubw %ymm2, %ymm9, %ymm2 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm8, %ymm7, %ymm7 +vpmullw %ymm12, %ymm8, %ymm9 +vpaddw %ymm9, %ymm7, %ymm9 +vpmullw %ymm12, %ymm9, %ymm9 +vpsubw %ymm9, %ymm2, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm3, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, %ymm3 
+vpsubw %ymm3, %ymm6, %ymm6 +vpmullw %ymm13, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_4_3_1(%rip), %ymm8, %ymm2 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm4 +vpor %ymm4, %ymm8, %ymm8 +vpaddw 2048(%rsp), %ymm11, %ymm11 +vpaddw %ymm8, %ymm11, %ymm11 +vmovdqa %xmm2, 2048(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_4_3_1(%rip), %ymm9, %ymm2 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm4 +vpor %ymm4, %ymm9, %ymm9 +vpaddw 2304(%rsp), %ymm6, %ymm6 +vpaddw %ymm9, %ymm6, %ymm6 +vmovdqa %xmm2, 2304(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_4_3_1(%rip), %ymm5, %ymm2 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm4 +vpor %ymm4, %ymm5, %ymm5 +vpaddw 2560(%rsp), %ymm7, %ymm7 +vpaddw %ymm5, %ymm7, %ymm7 +vmovdqa %xmm2, 2560(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %xmm11, 64(%rdi) +vextracti128 $1, %ymm11, %xmm11 +vmovq %xmm11, 80(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 416(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 432(%rdi) +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %xmm7, 768(%rdi) +vextracti128 $1, %ymm7, %xmm7 +vmovq %xmm7, 784(%rdi) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %xmm3, 1120(%rdi) +vextracti128 $1, %ymm3, %xmm3 +vmovq %xmm3, 1136(%rdi) +vmovdqa 32(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm9 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm8, %ymm8 +vmovdqa 288(%rsp), %ymm3 +vpunpcklwd const0(%rip), %ymm3, %ymm7 +vpunpckhwd const0(%rip), %ymm3, %ymm3 +vmovdqa 544(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm7, %ymm2 +vpaddd %ymm6, %ymm3, %ymm4 +vpsubd %ymm9, %ymm2, %ymm2 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm11, %ymm7, %ymm11 +vpsubd %ymm6, %ymm3, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1568(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm3 +vpunpckhwd const0(%rip), %ymm11, %ymm7 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm7, %ymm7 +vpsubd %ymm3, %ymm2, %ymm2 +vpsubd %ymm7, %ymm4, %ymm4 +vpsrld $1, %ymm2, %ymm2 +vpsrld $1, %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpackusdw %ymm4, %ymm2, %ymm4 +vmovdqa 800(%rsp), %ymm2 +vpaddw 1056(%rsp), %ymm2, %ymm7 +vpsubw 1056(%rsp), %ymm2, %ymm2 +vpsrlw $2, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsllw $1, %ymm5, %ymm3 +vpsubw %ymm3, %ymm7, %ymm3 +vpsllw $7, %ymm11, %ymm7 +vpsubw %ymm7, %ymm3, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm4, %ymm7, %ymm7 +vmovdqa 1312(%rsp), %ymm3 +vpsubw %ymm5, %ymm3, %ymm3 +vpmullw %ymm15, %ymm11, %ymm8 +vpsubw %ymm8, %ymm3, %ymm8 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm7, %ymm4, %ymm4 +vpmullw %ymm12, %ymm7, %ymm3 +vpaddw %ymm3, %ymm4, %ymm3 +vpmullw %ymm12, %ymm3, %ymm3 +vpsubw %ymm3, %ymm8, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw %ymm3, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vpmullw %ymm13, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_4_3_1(%rip), %ymm7, %ymm8 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $139, %ymm8, %ymm8 
+vpand mask_keephigh(%rip), %ymm8, %ymm9 +vpor %ymm9, %ymm7, %ymm7 +vpaddw 2080(%rsp), %ymm5, %ymm5 +vpaddw %ymm7, %ymm5, %ymm5 +vmovdqa %xmm8, 2080(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_4_3_1(%rip), %ymm3, %ymm8 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $139, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm9 +vpor %ymm9, %ymm3, %ymm3 +vpaddw 2336(%rsp), %ymm6, %ymm6 +vpaddw %ymm3, %ymm6, %ymm6 +vmovdqa %xmm8, 2336(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_4_3_1(%rip), %ymm11, %ymm8 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $139, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm9 +vpor %ymm9, %ymm11, %ymm11 +vpaddw 2592(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vmovdqa %xmm8, 2592(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %xmm5, 152(%rdi) +vextracti128 $1, %ymm5, %xmm5 +vmovq %xmm5, 168(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 504(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 520(%rdi) +vpand mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %xmm4, 856(%rdi) +vextracti128 $1, %ymm4, %xmm4 +vmovq %xmm4, 872(%rdi) +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %xmm2, 1208(%rdi) +vextracti128 $1, %ymm2, %xmm2 +vmovq %xmm2, 1224(%rdi) +vmovdqa 64(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm3 +vpunpckhwd const0(%rip), %ymm11, %ymm7 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm7, %ymm7 +vmovdqa 320(%rsp), %ymm2 +vpunpcklwd const0(%rip), %ymm2, %ymm4 +vpunpckhwd const0(%rip), %ymm2, %ymm2 +vmovdqa 576(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm4, %ymm8 +vpaddd %ymm6, %ymm2, %ymm9 +vpsubd %ymm3, %ymm8, %ymm8 +vpsubd %ymm7, %ymm9, %ymm9 +vpsubd %ymm5, %ymm4, %ymm5 +vpsubd %ymm6, %ymm2, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1600(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm4 +vpslld $1, %ymm2, %ymm2 +vpslld $1, %ymm4, %ymm4 +vpsubd %ymm2, %ymm8, %ymm8 +vpsubd %ymm4, %ymm9, %ymm9 +vpsrld $1, %ymm8, %ymm8 +vpsrld $1, %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpackusdw %ymm9, %ymm8, %ymm9 +vmovdqa 832(%rsp), %ymm8 +vpaddw 1088(%rsp), %ymm8, %ymm4 +vpsubw 1088(%rsp), %ymm8, %ymm8 +vpsrlw $2, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsllw $1, %ymm11, %ymm2 +vpsubw %ymm2, %ymm4, %ymm2 +vpsllw $7, %ymm5, %ymm4 +vpsubw %ymm4, %ymm2, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, %ymm4 +vmovdqa 1344(%rsp), %ymm2 +vpsubw %ymm11, %ymm2, %ymm2 +vpmullw %ymm15, %ymm5, %ymm7 +vpsubw %ymm7, %ymm2, %ymm7 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpmullw %ymm12, %ymm4, %ymm2 +vpaddw %ymm2, %ymm9, %ymm2 +vpmullw %ymm12, %ymm2, %ymm2 +vpsubw %ymm2, %ymm7, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vpmullw %ymm13, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_4_3_1(%rip), %ymm4, %ymm7 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $139, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm3 +vpor %ymm3, %ymm4, %ymm4 +vpaddw 2112(%rsp), %ymm11, %ymm11 +vpaddw %ymm4, %ymm11, %ymm11 +vmovdqa %xmm7, 2112(%rsp) +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_4_3_1(%rip), %ymm2, %ymm7 
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $139, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm3 +vpor %ymm3, %ymm2, %ymm2 +vpaddw 2368(%rsp), %ymm6, %ymm6 +vpaddw %ymm2, %ymm6, %ymm6 +vmovdqa %xmm7, 2368(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_4_3_1(%rip), %ymm5, %ymm7 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $139, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm3 +vpor %ymm3, %ymm5, %ymm5 +vpaddw 2624(%rsp), %ymm9, %ymm9 +vpaddw %ymm5, %ymm9, %ymm9 +vmovdqa %xmm7, 2624(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %xmm11, 240(%rdi) +vextracti128 $1, %ymm11, %xmm11 +vmovq %xmm11, 256(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 592(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 608(%rdi) +vpand mask_mod8192(%rip), %ymm9, %ymm9 +vmovdqu %xmm9, 944(%rdi) +vextracti128 $1, %ymm9, %xmm9 +vmovq %xmm9, 960(%rdi) +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %xmm8, 1296(%rdi) +vextracti128 $1, %ymm8, %xmm8 +vmovq %xmm8, 1312(%rdi) +vmovdqa 96(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm4 +vpslld $1, %ymm2, %ymm2 +vpslld $1, %ymm4, %ymm4 +vmovdqa 352(%rsp), %ymm8 +vpunpcklwd const0(%rip), %ymm8, %ymm9 +vpunpckhwd const0(%rip), %ymm8, %ymm8 +vmovdqa 608(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm9, %ymm7 +vpaddd %ymm6, %ymm8, %ymm3 +vpsubd %ymm2, %ymm7, %ymm7 +vpsubd %ymm4, %ymm3, %ymm3 +vpsubd %ymm11, %ymm9, %ymm11 +vpsubd %ymm6, %ymm8, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1632(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, %ymm9 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm9, %ymm9 +vpsubd %ymm8, %ymm7, %ymm7 +vpsubd %ymm9, %ymm3, %ymm3 +vpsrld $1, %ymm7, %ymm7 +vpsrld $1, %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpackusdw %ymm3, %ymm7, %ymm3 +vmovdqa 864(%rsp), %ymm7 +vpaddw 1120(%rsp), %ymm7, %ymm9 +vpsubw 1120(%rsp), %ymm7, %ymm7 +vpsrlw $2, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsllw $1, %ymm5, %ymm8 +vpsubw %ymm8, %ymm9, %ymm8 +vpsllw $7, %ymm11, %ymm9 +vpsubw %ymm9, %ymm8, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm3, %ymm9, %ymm9 +vmovdqa 1376(%rsp), %ymm8 +vpsubw %ymm5, %ymm8, %ymm8 +vpmullw %ymm15, %ymm11, %ymm4 +vpsubw %ymm4, %ymm8, %ymm4 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, %ymm3 +vpmullw %ymm12, %ymm9, %ymm8 +vpaddw %ymm8, %ymm3, %ymm8 +vpmullw %ymm12, %ymm8, %ymm8 +vpsubw %ymm8, %ymm4, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm7, %ymm8, %ymm8 +vpsubw %ymm8, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, %ymm6 +vpmullw %ymm13, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_4_3_1(%rip), %ymm9, %ymm4 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $139, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm9, %ymm9 +vpaddw 2144(%rsp), %ymm5, %ymm5 +vpaddw %ymm9, %ymm5, %ymm5 +vmovdqa %xmm4, 2144(%rsp) +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_4_3_1(%rip), %ymm8, %ymm4 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $139, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm8, %ymm8 +vpaddw 2400(%rsp), %ymm6, %ymm6 +vpaddw %ymm8, %ymm6, %ymm6 +vmovdqa %xmm4, 2400(%rsp) +vpshufb 
shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_4_3_1(%rip), %ymm11, %ymm4 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $139, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm11, %ymm11 +vpaddw 2656(%rsp), %ymm3, %ymm3 +vpaddw %ymm11, %ymm3, %ymm3 +vmovdqa %xmm4, 2656(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %xmm5, 328(%rdi) +vextracti128 $1, %ymm5, %xmm5 +vmovq %xmm5, 344(%rdi) +vpshufb shufmin1_mask3(%rip), %ymm5, %ymm5 +vmovdqa %xmm5, 1792(%rsp) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 680(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 696(%rdi) +vpshufb shufmin1_mask3(%rip), %ymm6, %ymm6 +vmovdqa %xmm6, 1824(%rsp) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %xmm3, 1032(%rdi) +vextracti128 $1, %ymm3, %xmm3 +vmovq %xmm3, 1048(%rdi) +vpshufb shufmin1_mask3(%rip), %ymm3, %ymm3 +vmovdqa %xmm3, 1856(%rsp) +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %xmm7, 1384(%rdi) +vextracti128 $1, %ymm7, %xmm7 +vpextrw $0, %xmm7, 1400(%rdi) +vpshufb shufmin1_mask3(%rip), %ymm7, %ymm7 +vmovdqa %xmm7, 1888(%rsp) +vmovdqa 128(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, %ymm9 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm9, %ymm9 +vmovdqa 384(%rsp), %ymm7 +vpunpcklwd const0(%rip), %ymm7, %ymm3 +vpunpckhwd const0(%rip), %ymm7, %ymm7 +vmovdqa 640(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm3, %ymm4 +vpaddd %ymm6, %ymm7, %ymm2 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm9, %ymm2, %ymm2 +vpsubd %ymm5, %ymm3, %ymm5 +vpsubd %ymm6, %ymm7, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1664(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm7 +vpunpckhwd const0(%rip), %ymm5, %ymm3 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm3, %ymm3 +vpsubd %ymm7, %ymm4, %ymm4 +vpsubd %ymm3, %ymm2, %ymm2 +vpsrld $1, %ymm4, %ymm4 +vpsrld $1, %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpackusdw %ymm2, %ymm4, %ymm2 +vmovdqa 896(%rsp), %ymm4 +vpaddw 1152(%rsp), %ymm4, %ymm3 +vpsubw 1152(%rsp), %ymm4, %ymm4 +vpsrlw $2, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsllw $1, %ymm11, %ymm7 +vpsubw %ymm7, %ymm3, %ymm7 +vpsllw $7, %ymm5, %ymm3 +vpsubw %ymm3, %ymm7, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vmovdqa 1408(%rsp), %ymm7 +vpsubw %ymm11, %ymm7, %ymm7 +vpmullw %ymm15, %ymm5, %ymm9 +vpsubw %ymm9, %ymm7, %ymm9 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm3, %ymm2, %ymm2 +vpmullw %ymm12, %ymm3, %ymm7 +vpaddw %ymm7, %ymm2, %ymm7 +vpmullw %ymm12, %ymm7, %ymm7 +vpsubw %ymm7, %ymm9, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm4, %ymm7, %ymm7 +vpsubw %ymm7, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpmullw %ymm13, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, %ymm6 +vmovdqu 416(%rdi), %ymm9 +vmovdqu 768(%rdi), %ymm8 +vmovdqu 1120(%rdi), %ymm10 +vpaddw %ymm11, %ymm9, %ymm11 +vpaddw %ymm6, %ymm8, %ymm6 +vpaddw %ymm2, %ymm10, %ymm2 +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_4_3_1(%rip), %ymm4, %ymm10 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $139, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm8 +vpor %ymm8, %ymm4, %ymm4 +vmovdqu 64(%rdi), %ymm8 +vpaddw 1920(%rsp), %ymm8, %ymm8 +vpaddw %ymm4, %ymm8, %ymm8 +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %xmm8, 64(%rdi) +vextracti128 $1, %ymm8, 
%xmm8 +vmovq %xmm8, 80(%rdi) +vmovdqa %xmm10, 1920(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_4_3_1(%rip), %ymm3, %ymm10 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $139, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm8 +vpor %ymm8, %ymm3, %ymm3 +vpaddw 2176(%rsp), %ymm11, %ymm11 +vpaddw %ymm3, %ymm11, %ymm11 +vmovdqa %xmm10, 2176(%rsp) +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_4_3_1(%rip), %ymm7, %ymm10 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $139, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm8 +vpor %ymm8, %ymm7, %ymm7 +vpaddw 2432(%rsp), %ymm6, %ymm6 +vpaddw %ymm7, %ymm6, %ymm6 +vmovdqa %xmm10, 2432(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_4_3_1(%rip), %ymm5, %ymm10 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $139, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm8 +vpor %ymm8, %ymm5, %ymm5 +vpaddw 2688(%rsp), %ymm2, %ymm2 +vpaddw %ymm5, %ymm2, %ymm2 +vmovdqa %xmm10, 2688(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %xmm11, 416(%rdi) +vextracti128 $1, %ymm11, %xmm11 +vmovq %xmm11, 432(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 768(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 784(%rdi) +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %xmm2, 1120(%rdi) +vextracti128 $1, %ymm2, %xmm2 +vmovq %xmm2, 1136(%rdi) +vmovdqa 160(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm7 +vpunpckhwd const0(%rip), %ymm5, %ymm3 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm3, %ymm3 +vmovdqa 416(%rsp), %ymm4 +vpunpcklwd const0(%rip), %ymm4, %ymm2 +vpunpckhwd const0(%rip), %ymm4, %ymm4 +vmovdqa 672(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm2, %ymm10 +vpaddd %ymm6, %ymm4, %ymm8 +vpsubd %ymm7, %ymm10, %ymm10 +vpsubd %ymm3, %ymm8, %ymm8 +vpsubd %ymm11, %ymm2, %ymm11 +vpsubd %ymm6, %ymm4, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1696(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm2, %ymm2 +vpsubd %ymm4, %ymm10, %ymm10 +vpsubd %ymm2, %ymm8, %ymm8 +vpsrld $1, %ymm10, %ymm10 +vpsrld $1, %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpackusdw %ymm8, %ymm10, %ymm8 +vmovdqa 928(%rsp), %ymm10 +vpaddw 1184(%rsp), %ymm10, %ymm2 +vpsubw 1184(%rsp), %ymm10, %ymm10 +vpsrlw $2, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsllw $1, %ymm5, %ymm4 +vpsubw %ymm4, %ymm2, %ymm4 +vpsllw $7, %ymm11, %ymm2 +vpsubw %ymm2, %ymm4, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vmovdqa 1440(%rsp), %ymm4 +vpsubw %ymm5, %ymm4, %ymm4 +vpmullw %ymm15, %ymm11, %ymm3 +vpsubw %ymm3, %ymm4, %ymm3 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpmullw %ymm12, %ymm2, %ymm4 +vpaddw %ymm4, %ymm8, %ymm4 +vpmullw %ymm12, %ymm4, %ymm4 +vpsubw %ymm4, %ymm3, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm10, %ymm4, %ymm4 +vpsubw %ymm4, %ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vpmullw %ymm13, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vmovdqu 504(%rdi), %ymm3 +vmovdqu 856(%rdi), %ymm7 +vmovdqu 1208(%rdi), %ymm9 +vpaddw %ymm5, %ymm3, %ymm5 +vpaddw %ymm6, %ymm7, %ymm6 +vpaddw %ymm8, %ymm9, %ymm8 +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_4_3_1(%rip), %ymm10, %ymm9 +vpand 
mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $139, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm7 +vpor %ymm7, %ymm10, %ymm10 +vmovdqu 152(%rdi), %ymm7 +vpaddw 1952(%rsp), %ymm7, %ymm7 +vpaddw %ymm10, %ymm7, %ymm7 +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %xmm7, 152(%rdi) +vextracti128 $1, %ymm7, %xmm7 +vmovq %xmm7, 168(%rdi) +vmovdqa %xmm9, 1952(%rsp) +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_4_3_1(%rip), %ymm2, %ymm9 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $139, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm7 +vpor %ymm7, %ymm2, %ymm2 +vpaddw 2208(%rsp), %ymm5, %ymm5 +vpaddw %ymm2, %ymm5, %ymm5 +vmovdqa %xmm9, 2208(%rsp) +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_4_3_1(%rip), %ymm4, %ymm9 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $139, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm7 +vpor %ymm7, %ymm4, %ymm4 +vpaddw 2464(%rsp), %ymm6, %ymm6 +vpaddw %ymm4, %ymm6, %ymm6 +vmovdqa %xmm9, 2464(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_4_3_1(%rip), %ymm11, %ymm9 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $139, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm7 +vpor %ymm7, %ymm11, %ymm11 +vpaddw 2720(%rsp), %ymm8, %ymm8 +vpaddw %ymm11, %ymm8, %ymm8 +vmovdqa %xmm9, 2720(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %xmm5, 504(%rdi) +vextracti128 $1, %ymm5, %xmm5 +vmovq %xmm5, 520(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 856(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 872(%rdi) +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %xmm8, 1208(%rdi) +vextracti128 $1, %ymm8, %xmm8 +vmovq %xmm8, 1224(%rdi) +vmovdqa 192(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm2, %ymm2 +vmovdqa 448(%rsp), %ymm10 +vpunpcklwd const0(%rip), %ymm10, %ymm8 +vpunpckhwd const0(%rip), %ymm10, %ymm10 +vmovdqa 704(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm8, %ymm9 +vpaddd %ymm6, %ymm10, %ymm7 +vpsubd %ymm4, %ymm9, %ymm9 +vpsubd %ymm2, %ymm7, %ymm7 +vpsubd %ymm5, %ymm8, %ymm5 +vpsubd %ymm6, %ymm10, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1728(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm10 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm8, %ymm8 +vpsubd %ymm10, %ymm9, %ymm9 +vpsubd %ymm8, %ymm7, %ymm7 +vpsrld $1, %ymm9, %ymm9 +vpsrld $1, %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpackusdw %ymm7, %ymm9, %ymm7 +vmovdqa 960(%rsp), %ymm9 +vpaddw 1216(%rsp), %ymm9, %ymm8 +vpsubw 1216(%rsp), %ymm9, %ymm9 +vpsrlw $2, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsllw $1, %ymm11, %ymm10 +vpsubw %ymm10, %ymm8, %ymm10 +vpsllw $7, %ymm5, %ymm8 +vpsubw %ymm8, %ymm10, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm7, %ymm8, %ymm8 +vmovdqa 1472(%rsp), %ymm10 +vpsubw %ymm11, %ymm10, %ymm10 +vpmullw %ymm15, %ymm5, %ymm2 +vpsubw %ymm2, %ymm10, %ymm2 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm8, %ymm7, %ymm7 +vpmullw %ymm12, %ymm8, %ymm10 +vpaddw %ymm10, %ymm7, %ymm10 +vpmullw %ymm12, %ymm10, %ymm10 +vpsubw %ymm10, %ymm2, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm9, %ymm10, %ymm10 +vpsubw %ymm10, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpmullw %ymm13, 
%ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vmovdqu 592(%rdi), %ymm2 +vmovdqu 944(%rdi), %ymm4 +vmovdqu 1296(%rdi), %ymm3 +vpaddw %ymm11, %ymm2, %ymm11 +vpaddw %ymm6, %ymm4, %ymm6 +vpaddw %ymm7, %ymm3, %ymm7 +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_4_3_1(%rip), %ymm9, %ymm3 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $139, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm9, %ymm9 +vmovdqu 240(%rdi), %ymm4 +vpaddw 1984(%rsp), %ymm4, %ymm4 +vpaddw %ymm9, %ymm4, %ymm4 +vpand mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %xmm4, 240(%rdi) +vextracti128 $1, %ymm4, %xmm4 +vmovq %xmm4, 256(%rdi) +vmovdqa %xmm3, 1984(%rsp) +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_4_3_1(%rip), %ymm8, %ymm3 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $139, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm8, %ymm8 +vpaddw 2240(%rsp), %ymm11, %ymm11 +vpaddw %ymm8, %ymm11, %ymm11 +vmovdqa %xmm3, 2240(%rsp) +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_4_3_1(%rip), %ymm10, %ymm3 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $139, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm10, %ymm10 +vpaddw 2496(%rsp), %ymm6, %ymm6 +vpaddw %ymm10, %ymm6, %ymm6 +vmovdqa %xmm3, 2496(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_4_3_1(%rip), %ymm5, %ymm3 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $139, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm5, %ymm5 +vpaddw 2752(%rsp), %ymm7, %ymm7 +vpaddw %ymm5, %ymm7, %ymm7 +vmovdqa %xmm3, 2752(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %xmm11, 592(%rdi) +vextracti128 $1, %ymm11, %xmm11 +vmovq %xmm11, 608(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 944(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 960(%rdi) +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %xmm7, 1296(%rdi) +vextracti128 $1, %ymm7, %xmm7 +vmovq %xmm7, 1312(%rdi) +vmovdqa 224(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm10 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm8, %ymm8 +vmovdqa 480(%rsp), %ymm9 +vpunpcklwd const0(%rip), %ymm9, %ymm7 +vpunpckhwd const0(%rip), %ymm9, %ymm9 +vmovdqa 736(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm7, %ymm3 +vpaddd %ymm6, %ymm9, %ymm4 +vpsubd %ymm10, %ymm3, %ymm3 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm11, %ymm7, %ymm11 +vpsubd %ymm6, %ymm9, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1760(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm9 +vpunpckhwd const0(%rip), %ymm11, %ymm7 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm7, %ymm7 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm7, %ymm4, %ymm4 +vpsrld $1, %ymm3, %ymm3 +vpsrld $1, %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpackusdw %ymm4, %ymm3, %ymm4 +vmovdqa 992(%rsp), %ymm3 +vpaddw 1248(%rsp), %ymm3, %ymm7 +vpsubw 1248(%rsp), %ymm3, %ymm3 +vpsrlw $2, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsllw $1, %ymm5, %ymm9 +vpsubw %ymm9, %ymm7, %ymm9 +vpsllw $7, %ymm11, %ymm7 +vpsubw %ymm7, %ymm9, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm4, %ymm7, %ymm7 +vmovdqa 1504(%rsp), %ymm9 +vpsubw %ymm5, %ymm9, %ymm9 +vpmullw %ymm15, %ymm11, %ymm8 +vpsubw %ymm8, %ymm9, %ymm8 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm7, %ymm4, %ymm4 +vpmullw %ymm12, %ymm7, 
%ymm9 +vpaddw %ymm9, %ymm4, %ymm9 +vpmullw %ymm12, %ymm9, %ymm9 +vpsubw %ymm9, %ymm8, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm3, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vpmullw %ymm13, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vextracti128 $1, %ymm4, %xmm8 +vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8 +vmovdqa %ymm8, 2816(%rsp) +vextracti128 $1, %ymm3, %xmm8 +vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8 +vmovdqa %ymm8, 2848(%rsp) +vextracti128 $1, %ymm7, %xmm8 +vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8 +vmovdqa %ymm8, 2880(%rsp) +vmovdqu 680(%rdi), %ymm8 +vmovdqu 1032(%rdi), %ymm10 +vmovdqu 1384(%rdi), %ymm2 +vpaddw %ymm5, %ymm8, %ymm5 +vpaddw %ymm6, %ymm10, %ymm6 +vpaddw %ymm4, %ymm2, %ymm4 +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_4_3_1(%rip), %ymm3, %ymm2 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm10 +vpor %ymm10, %ymm3, %ymm3 +vmovdqu 328(%rdi), %ymm10 +vpaddw 2016(%rsp), %ymm10, %ymm10 +vpaddw %ymm3, %ymm10, %ymm10 +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %xmm10, 328(%rdi) +vextracti128 $1, %ymm10, %xmm10 +vmovq %xmm10, 344(%rdi) +vpshufb shufmin1_mask3(%rip), %ymm10, %ymm10 +vmovdqa %xmm10, 1792(%rsp) +vmovdqa %xmm2, 2016(%rsp) +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_4_3_1(%rip), %ymm7, %ymm2 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm10 +vpor %ymm10, %ymm7, %ymm7 +vpaddw 2272(%rsp), %ymm5, %ymm5 +vpaddw %ymm7, %ymm5, %ymm5 +vmovdqa %xmm2, 2272(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_4_3_1(%rip), %ymm9, %ymm2 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm10 +vpor %ymm10, %ymm9, %ymm9 +vpaddw 2528(%rsp), %ymm6, %ymm6 +vpaddw %ymm9, %ymm6, %ymm6 +vmovdqa %xmm2, 2528(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_4_3_1(%rip), %ymm11, %ymm2 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm10 +vpor %ymm10, %ymm11, %ymm11 +vpaddw 2784(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vmovdqa %xmm2, 2784(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %xmm5, 680(%rdi) +vextracti128 $1, %ymm5, %xmm5 +vmovq %xmm5, 696(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 1032(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 1048(%rdi) +vpand mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %xmm4, 1384(%rdi) +vextracti128 $1, %ymm4, %xmm4 +vpextrw $0, %xmm4, 1400(%rdi) +vmovdqu 0(%rdi), %ymm11 +vpaddw 1888(%rsp), %ymm11, %ymm11 +vpaddw 2816(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 0(%rdi) +vmovdqu 352(%rdi), %ymm11 +vpaddw 2528(%rsp), %ymm11, %ymm11 +vpaddw 2848(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 352(%rdi) +vmovdqu 704(%rdi), %ymm11 +vpaddw 2784(%rsp), %ymm11, %ymm11 +vpaddw 2880(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 704(%rdi) +vmovdqu 88(%rdi), %ymm11 +vpaddw 2048(%rsp), %ymm11, %ymm11 +vpaddw 1920(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 88(%rdi) +vmovdqu 440(%rdi), %ymm11 +vpaddw 2304(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 440(%rdi) +vmovdqu 792(%rdi), %ymm11 +vpaddw 2560(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 792(%rdi) +vmovdqu 176(%rdi), 
%ymm11 +vpaddw 2080(%rsp), %ymm11, %ymm11 +vpaddw 1952(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 176(%rdi) +vmovdqu 528(%rdi), %ymm11 +vpaddw 2336(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 528(%rdi) +vmovdqu 880(%rdi), %ymm11 +vpaddw 2592(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 880(%rdi) +vmovdqu 264(%rdi), %ymm11 +vpaddw 2112(%rsp), %ymm11, %ymm11 +vpaddw 1984(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 264(%rdi) +vmovdqu 616(%rdi), %ymm11 +vpaddw 2368(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 616(%rdi) +vmovdqu 968(%rdi), %ymm11 +vpaddw 2624(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 968(%rdi) +vmovdqu 352(%rdi), %ymm11 +vpaddw 2144(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 352(%rdi) +vmovdqu 704(%rdi), %ymm11 +vpaddw 2400(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 704(%rdi) +vmovdqu 1056(%rdi), %ymm11 +vpaddw 2656(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 1056(%rdi) +vmovdqu 440(%rdi), %ymm11 +vpaddw 2176(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 440(%rdi) +vmovdqu 792(%rdi), %ymm11 +vpaddw 2432(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 792(%rdi) +vmovdqu 1144(%rdi), %ymm11 +vpaddw 2688(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 1144(%rdi) +vmovdqu 528(%rdi), %ymm11 +vpaddw 2208(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 528(%rdi) +vmovdqu 880(%rdi), %ymm11 +vpaddw 2464(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 880(%rdi) +vmovdqu 1232(%rdi), %ymm11 +vpaddw 2720(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 1232(%rdi) +vmovdqu 616(%rdi), %ymm11 +vpaddw 2240(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 616(%rdi) +vmovdqu 968(%rdi), %ymm11 +vpaddw 2496(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 968(%rdi) +vmovdqu 1320(%rdi), %ymm11 +vpaddw 2752(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 1320(%rdi) +mov %r8, %rsp +pop %r12 +pop %rbp +ret +.cfi_endproc + +#endif diff --git a/crypto/hrss/hrss.c b/crypto/hrss/hrss.c new file mode 100644 index 00000000..c059b834 --- /dev/null +++ b/crypto/hrss/hrss.c @@ -0,0 +1,2265 @@ +/* Copyright (c) 2018, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
 */
+
+#include <openssl/hrss.h>
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <openssl/bn.h>
+#include <openssl/cpu.h>
+#include <openssl/hmac.h>
+#include <openssl/mem.h>
+#include <openssl/sha.h>
+
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+#include <emmintrin.h>
+#endif
+
+#if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
+    (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include <arm_neon.h>
+#endif
+
+#if defined(_MSC_VER)
+#define RESTRICT
+#else
+#define RESTRICT restrict
+#endif
+
+#include "../internal.h"
+#include "internal.h"
+
+// This is an implementation of [HRSS], but with a KEM transformation based on
+// [SXY]. The primary references are:
+//
+// HRSS: https://eprint.iacr.org/2017/667.pdf
+// HRSSNIST:
+//     https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip
+// SXY: https://eprint.iacr.org/2017/1005.pdf
+// NTRUTN14:
+//     https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf
+
+
+// Vector operations.
+//
+// A couple of functions in this file can use vector operations to meaningful
+// effect. If we're building for a target that has a supported vector unit,
+// |HRSS_HAVE_VECTOR_UNIT| will be defined and |vec_t| will be typedefed to a
+// 128-bit vector. The following functions abstract over the differences
+// between NEON and SSE2 for implementing some vector operations.
+
+// TODO: MSVC can likely also be made to work with vector operations.
+#if (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) && \
+    (defined(__clang__) || !defined(_MSC_VER))
+
+#define HRSS_HAVE_VECTOR_UNIT
+typedef __m128i vec_t;
+
+// vec_capable returns one iff the current platform supports SSE2.
+static int vec_capable(void) {
+#if defined(__SSE2__)
+  return 1;
+#else
+  int has_sse2 = (OPENSSL_ia32cap_P[0] & (1 << 26)) != 0;
+  return has_sse2;
+#endif
+}
+
+// vec_add performs a pair-wise addition of eight uint16s from |a| and |b|.
+static inline vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); }
+
+// vec_sub performs a pair-wise subtraction of eight uint16s from |a| and |b|.
+static inline vec_t vec_sub(vec_t a, vec_t b) { return _mm_sub_epi16(a, b); }
+
+// vec_mul multiplies each uint16_t in |a| by |b| and returns the resulting
+// vector.
+static inline vec_t vec_mul(vec_t a, uint16_t b) {
+  return _mm_mullo_epi16(a, _mm_set1_epi16(b));
+}
+
+// vec_fma multiplies each uint16_t in |b| by |c|, adds the result to |a|, and
+// returns the resulting vector.
+static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
+  return _mm_add_epi16(a, _mm_mullo_epi16(b, _mm_set1_epi16(c)));
+}
+
+// vec3_rshift_word right-shifts the 24 uint16_t's in |v| by one uint16.
+static inline void vec3_rshift_word(vec_t v[3]) {
+  // Intel's left and right shifting is backwards compared to the order in
+  // memory because they're based on little-endian order of words (and not just
+  // bytes). So the shifts in this function will be backwards from what one
+  // might expect.
+  const __m128i carry0 = _mm_srli_si128(v[0], 14);
+  v[0] = _mm_slli_si128(v[0], 2);
+
+  const __m128i carry1 = _mm_srli_si128(v[1], 14);
+  v[1] = _mm_slli_si128(v[1], 2);
+  v[1] |= carry0;
+
+  v[2] = _mm_slli_si128(v[2], 2);
+  v[2] |= carry1;
+}
+
+// vec4_rshift_word right-shifts the 32 uint16_t's in |v| by one uint16.
+static inline void vec4_rshift_word(vec_t v[4]) {
+  // Intel's left and right shifting is backwards compared to the order in
+  // memory because they're based on little-endian order of words (and not just
+  // bytes). So the shifts in this function will be backwards from what one
+  // might expect.
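+  // (Concretely: _mm_slli_si128 moves bytes towards the most-significant end
+  // of the register which, for a little-endian array of uint16_t, is towards
+  // higher coefficient indices, i.e. the "right shift by one word" that this
+  // function performs.)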
+ const __m128i carry0 = _mm_srli_si128(v[0], 14); + v[0] = _mm_slli_si128(v[0], 2); + + const __m128i carry1 = _mm_srli_si128(v[1], 14); + v[1] = _mm_slli_si128(v[1], 2); + v[1] |= carry0; + + const __m128i carry2 = _mm_srli_si128(v[2], 14); + v[2] = _mm_slli_si128(v[2], 2); + v[2] |= carry1; + + v[3] = _mm_slli_si128(v[3], 2); + v[3] |= carry2; +} + +// vec_merge_3_5 takes the final three uint16_t's from |left|, appends the first +// five from |right|, and returns the resulting vector. +static inline vec_t vec_merge_3_5(vec_t left, vec_t right) { + return _mm_srli_si128(left, 10) | _mm_slli_si128(right, 6); +} + +// poly3_vec_lshift1 left-shifts the 768 bits in |a_s|, and in |a_a|, by one +// bit. +static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + + for (int i = 0; i < 6; i++) { + vec_t next_carry_s = _mm_srli_epi64(a_s[i], 63); + a_s[i] = _mm_slli_epi64(a_s[i], 1); + a_s[i] |= _mm_slli_si128(next_carry_s, 8); + a_s[i] |= carry_s; + carry_s = _mm_srli_si128(next_carry_s, 8); + + vec_t next_carry_a = _mm_srli_epi64(a_a[i], 63); + a_a[i] = _mm_slli_epi64(a_a[i], 1); + a_a[i] |= _mm_slli_si128(next_carry_a, 8); + a_a[i] |= carry_a; + carry_a = _mm_srli_si128(next_carry_a, 8); + } +} + +// poly3_vec_rshift1 right-shifts the 768 bits in |a_s|, and in |a_a|, by one +// bit. +static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + + for (int i = 5; i >= 0; i--) { + const vec_t next_carry_s = _mm_slli_epi64(a_s[i], 63); + a_s[i] = _mm_srli_epi64(a_s[i], 1); + a_s[i] |= _mm_srli_si128(next_carry_s, 8); + a_s[i] |= carry_s; + carry_s = _mm_slli_si128(next_carry_s, 8); + + const vec_t next_carry_a = _mm_slli_epi64(a_a[i], 63); + a_a[i] = _mm_srli_epi64(a_a[i], 1); + a_a[i] |= _mm_srli_si128(next_carry_a, 8); + a_a[i] |= carry_a; + carry_a = _mm_slli_si128(next_carry_a, 8); + } +} + +// vec_broadcast_bit duplicates the least-significant bit in |a| to all bits in +// a vector and returns the result. +static inline vec_t vec_broadcast_bit(vec_t a) { + return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63), 31), + 0b01010101); +} + +// vec_broadcast_bit15 duplicates the most-significant bit of the first word in +// |a| to all bits in a vector and returns the result. +static inline vec_t vec_broadcast_bit15(vec_t a) { + return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63 - 15), 31), + 0b01010101); +} + +// vec_get_word returns the |i|th uint16_t in |v|. (This is a macro because the +// compiler requires that |i| be a compile-time constant.) +#define vec_get_word(v, i) _mm_extract_epi16(v, i) + +#elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \ + (defined(__ARM_NEON__) || defined(__ARM_NEON)) + +#define HRSS_HAVE_VECTOR_UNIT +typedef uint16x8_t vec_t; + +// These functions perform the same actions as the SSE2 function of the same +// name, above. 
+ +static int vec_capable() { return CRYPTO_is_NEON_capable(); } + +static inline vec_t vec_add(vec_t a, vec_t b) { return a + b; } + +static inline vec_t vec_sub(vec_t a, vec_t b) { return a - b; } + +static inline vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); } + +static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) { + return vmlaq_n_u16(a, b, c); +} + +static inline void vec3_rshift_word(vec_t v[3]) { + const uint16x8_t kZero = {0}; + v[2] = vextq_u16(v[1], v[2], 7); + v[1] = vextq_u16(v[0], v[1], 7); + v[0] = vextq_u16(kZero, v[0], 7); +} + +static inline void vec4_rshift_word(vec_t v[4]) { + const uint16x8_t kZero = {0}; + v[3] = vextq_u16(v[2], v[3], 7); + v[2] = vextq_u16(v[1], v[2], 7); + v[1] = vextq_u16(v[0], v[1], 7); + v[0] = vextq_u16(kZero, v[0], 7); +} + +static inline vec_t vec_merge_3_5(vec_t left, vec_t right) { + return vextq_u16(left, right, 5); +} + +static inline uint16_t vec_get_word(vec_t v, unsigned i) { + return v[i]; +} + +#if !defined(OPENSSL_AARCH64) + +static inline vec_t vec_broadcast_bit(vec_t a) { + a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15); + return vdupq_lane_u16(vget_low_u16(a), 0); +} + +static inline vec_t vec_broadcast_bit15(vec_t a) { + a = (vec_t)vshrq_n_s16((int16x8_t)a, 15); + return vdupq_lane_u16(vget_low_u16(a), 0); +} + +static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + const vec_t kZero = {0}; + + for (int i = 0; i < 6; i++) { + vec_t next_carry_s = a_s[i] >> 15; + a_s[i] <<= 1; + a_s[i] |= vextq_u16(kZero, next_carry_s, 7); + a_s[i] |= carry_s; + carry_s = vextq_u16(next_carry_s, kZero, 7); + + vec_t next_carry_a = a_a[i] >> 15; + a_a[i] <<= 1; + a_a[i] |= vextq_u16(kZero, next_carry_a, 7); + a_a[i] |= carry_a; + carry_a = vextq_u16(next_carry_a, kZero, 7); + } +} + +static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + const vec_t kZero = {0}; + + for (int i = 5; i >= 0; i--) { + vec_t next_carry_s = a_s[i] << 15; + a_s[i] >>= 1; + a_s[i] |= vextq_u16(next_carry_s, kZero, 1); + a_s[i] |= carry_s; + carry_s = vextq_u16(kZero, next_carry_s, 1); + + vec_t next_carry_a = a_a[i] << 15; + a_a[i] >>= 1; + a_a[i] |= vextq_u16(next_carry_a, kZero, 1); + a_a[i] |= carry_a; + carry_a = vextq_u16(kZero, next_carry_a, 1); + } +} + +#endif // !OPENSSL_AARCH64 + +#endif // (ARM || AARCH64) && NEON + +// Polynomials in this scheme have N terms. +// #define N 701 + +// Underlying data types and arithmetic operations. +// ------------------------------------------------ + +// Binary polynomials. + +// poly2 represents a degree-N polynomial over GF(2). The words are in little- +// endian order, i.e. the coefficient of x^0 is the LSB of the first word. The +// final word is only partially used since N is not a multiple of the word size. + +// Defined in internal.h: +// struct poly2 { +// crypto_word_t v[WORDS_PER_POLY]; +// }; + +OPENSSL_UNUSED static void hexdump(const void *void_in, size_t len) { + const uint8_t *in = (const uint8_t *)void_in; + for (size_t i = 0; i < len; i++) { + printf("%02x", in[i]); + } + printf("\n"); +} + +static void poly2_zero(struct poly2 *p) { + OPENSSL_memset(&p->v[0], 0, sizeof(crypto_word_t) * WORDS_PER_POLY); +} + +// poly2_cmov sets |out| to |in| iff |mov| is all ones. 
+static void poly2_cmov(struct poly2 *out, const struct poly2 *in, + crypto_word_t mov) { + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + out->v[i] = (out->v[i] & ~mov) | (in->v[i] & mov); + } +} + +// poly2_rotr_words performs a right-rotate on |in|, writing the result to +// |out|. The shift count, |bits|, must be a non-zero multiple of the word size. +static void poly2_rotr_words(struct poly2 *out, const struct poly2 *in, + size_t bits) { + assert(bits >= BITS_PER_WORD && bits % BITS_PER_WORD == 0); + assert(out != in); + + const size_t start = bits / BITS_PER_WORD; + const size_t n = (N - bits) / BITS_PER_WORD; + + // The rotate is by a whole number of words so the first few words are easy: + // just move them down. + for (size_t i = 0; i < n; i++) { + out->v[i] = in->v[start + i]; + } + + // Since the last word is only partially filled, however, the remainder needs + // shifting and merging of words to take care of that. + crypto_word_t carry = in->v[WORDS_PER_POLY - 1]; + + for (size_t i = 0; i < start; i++) { + out->v[n + i] = carry | in->v[i] << BITS_IN_LAST_WORD; + carry = in->v[i] >> (BITS_PER_WORD - BITS_IN_LAST_WORD); + } + + out->v[WORDS_PER_POLY - 1] = carry; +} + +// poly2_rotr_bits performs a right-rotate on |in|, writing the result to |out|. +// The shift count, |bits|, must be a power of two that is less than +// |BITS_PER_WORD|. +static void poly2_rotr_bits(struct poly2 *out, const struct poly2 *in, + size_t bits) { + assert(bits <= BITS_PER_WORD / 2); + assert(bits != 0); + assert((bits & (bits - 1)) == 0); + assert(out != in); + + // BITS_PER_WORD/2 is the greatest legal value of |bits|. If + // |BITS_IN_LAST_WORD| is smaller than this then the code below doesn't work + // because more than the last word needs to carry down in the previous one and + // so on. + OPENSSL_STATIC_ASSERT( + BITS_IN_LAST_WORD >= BITS_PER_WORD / 2, + "there are more carry bits than fit in BITS_IN_LAST_WORD"); + + crypto_word_t carry = in->v[WORDS_PER_POLY - 1] << (BITS_PER_WORD - bits); + + for (size_t i = WORDS_PER_POLY - 2; i < WORDS_PER_POLY; i--) { + out->v[i] = carry | in->v[i] >> bits; + carry = in->v[i] << (BITS_PER_WORD - bits); + } + + crypto_word_t last_word = carry >> (BITS_PER_WORD - BITS_IN_LAST_WORD) | + in->v[WORDS_PER_POLY - 1] >> bits; + last_word &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1; + out->v[WORDS_PER_POLY - 1] = last_word; +} + +// HRSS_poly2_rotr_consttime right-rotates |p| by |bits| in constant-time. +void HRSS_poly2_rotr_consttime(struct poly2 *p, size_t bits) { + assert(bits <= N); + assert(p->v[WORDS_PER_POLY-1] >> BITS_IN_LAST_WORD == 0); + + // Constant-time rotation is implemented by calculating the rotations of + // powers-of-two bits and throwing away the unneeded values. 2^9 (i.e. 512) is + // the largest power-of-two shift that we need to consider because 2^10 > N. 
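+  // For example, a rotation by 300 = 256 + 32 + 8 + 4 is performed by
+  // computing the rotation by each power of two in turn and keeping or
+  // discarding each result, in constant time, according to the corresponding
+  // bit of |bits|.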
+#define HRSS_POLY2_MAX_SHIFT 9
+  size_t shift = HRSS_POLY2_MAX_SHIFT;
+  OPENSSL_STATIC_ASSERT((1 << (HRSS_POLY2_MAX_SHIFT + 1)) > N,
+                        "maximum shift is too small");
+  OPENSSL_STATIC_ASSERT((1 << HRSS_POLY2_MAX_SHIFT) <= N,
+                        "maximum shift is too large");
+  struct poly2 shifted;
+
+  for (; (UINT64_C(1) << shift) >= BITS_PER_WORD; shift--) {
+    poly2_rotr_words(&shifted, p, UINT64_C(1) << shift);
+    poly2_cmov(p, &shifted, ~((1 & (bits >> shift)) - 1));
+  }
+
+  for (; shift < HRSS_POLY2_MAX_SHIFT; shift--) {
+    poly2_rotr_bits(&shifted, p, UINT64_C(1) << shift);
+    poly2_cmov(p, &shifted, ~((1 & (bits >> shift)) - 1));
+  }
+#undef HRSS_POLY2_MAX_SHIFT
+}
+
+// poly2_cswap exchanges the values of |a| and |b| if |swap| is all ones.
+static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap) {
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    const crypto_word_t sum = swap & (a->v[i] ^ b->v[i]);
+    a->v[i] ^= sum;
+    b->v[i] ^= sum;
+  }
+}
+
+// poly2_fmadd sets |out| to |out| + |in| * m, where m is either
+// |CONSTTIME_TRUE_W| or |CONSTTIME_FALSE_W|.
+static void poly2_fmadd(struct poly2 *out, const struct poly2 *in,
+                        crypto_word_t m) {
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    out->v[i] ^= in->v[i] & m;
+  }
+}
+
+// poly2_lshift1 left-shifts |p| by one bit.
+static void poly2_lshift1(struct poly2 *p) {
+  crypto_word_t carry = 0;
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    const crypto_word_t next_carry = p->v[i] >> (BITS_PER_WORD - 1);
+    p->v[i] <<= 1;
+    p->v[i] |= carry;
+    carry = next_carry;
+  }
+}
+
+// poly2_rshift1 right-shifts |p| by one bit.
+static void poly2_rshift1(struct poly2 *p) {
+  crypto_word_t carry = 0;
+  for (size_t i = WORDS_PER_POLY - 1; i < WORDS_PER_POLY; i--) {
+    const crypto_word_t next_carry = p->v[i] & 1;
+    p->v[i] >>= 1;
+    p->v[i] |= carry << (BITS_PER_WORD - 1);
+    carry = next_carry;
+  }
+}
+
+// poly2_clear_top_bits clears the bits in the final word that are only for
+// alignment.
+static void poly2_clear_top_bits(struct poly2 *p) {
+  p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+}
+
+// poly2_top_bits_are_clear returns one iff the extra bits in the final word
+// of |p| are zero.
+static int poly2_top_bits_are_clear(const struct poly2 *p) {
+  return (p->v[WORDS_PER_POLY - 1] &
+          ~((UINT64_C(1) << BITS_IN_LAST_WORD) - 1)) == 0;
+}
+
+// Ternary polynomials.
+
+// poly3 represents a degree-N polynomial over GF(3). Each coefficient is
+// bitsliced across the |s| and |a| arrays, like this:
+//
+//   s | a | value
+//  -----------------
+//   0 | 0 | 0
+//   0 | 1 | 1
+//   1 | 0 | 2 (aka -1)
+//   1 | 1 | <invalid>
+//
+// ('s' is for sign, and 'a' just a letter.)
+//
+// Once bitsliced as such, the following circuits can be used to implement
+// addition and multiplication mod 3:
+//
+//   (s3, a3) = (s1, a1) × (s2, a2)
+//   s3 = (a1 ∧ s2) ⊕ (s1 ∧ a2)
+//   a3 = (s1 ∧ s2) ⊕ (a1 ∧ a2)
+//
+//   (s3, a3) = (s1, a1) + (s2, a2)
+//   x = (a1 ⊕ a2)
+//   y = (s1 ⊕ s2) ⊕ (a1 ∧ a2)
+//   z = (s1 ∧ s2)
+//   s3 = y ∧ ¬x
+//   a3 = z ∨ (x ∧ ¬y)
+//
+// Negating a value just involves swapping s and a.
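+//
+// As a worked example of the addition circuit above: adding 1 = (0, 1) to
+// 2 = (1, 0) gives x = 1 ⊕ 0 = 1, y = (0 ⊕ 1) ⊕ (1 ∧ 0) = 1 and z = 0 ∧ 1 = 0,
+// so s3 = y ∧ ¬x = 0 and a3 = z ∨ (x ∧ ¬y) = 0. The result, (0, 0), is zero,
+// as expected since 1 + 2 ≡ 0 (mod 3).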
+// struct poly3 { +// struct poly2 s, a; +// }; + +OPENSSL_UNUSED static void poly3_print(const struct poly3 *in) { + struct poly3 p; + OPENSSL_memcpy(&p, in, sizeof(p)); + p.s.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1; + p.a.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1; + + printf("{["); + for (unsigned i = 0; i < WORDS_PER_POLY; i++) { + if (i) { + printf(" "); + } + printf(BN_HEX_FMT2, p.s.v[i]); + } + printf("] ["); + for (unsigned i = 0; i < WORDS_PER_POLY; i++) { + if (i) { + printf(" "); + } + printf(BN_HEX_FMT2, p.a.v[i]); + } + printf("]}\n"); +} + +static void poly3_zero(struct poly3 *p) { + poly2_zero(&p->s); + poly2_zero(&p->a); +} + +// lsb_to_all replicates the least-significant bit of |v| to all bits of the +// word. This is used in bit-slicing operations to make a vector from a fixed +// value. +static crypto_word_t lsb_to_all(crypto_word_t v) { return 0u - (v & 1); } + +// poly3_mul_const sets |p| to |p|×m, where m = (ms, ma). +static void poly3_mul_const(struct poly3 *p, crypto_word_t ms, + crypto_word_t ma) { + ms = lsb_to_all(ms); + ma = lsb_to_all(ma); + + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + const crypto_word_t s = p->s.v[i]; + const crypto_word_t a = p->a.v[i]; + p->s.v[i] = (s & ma) ^ (ms & a); + p->a.v[i] = (ms & s) ^ (ma & a); + } +} + +// poly3_rotr_consttime right-rotates |p| by |bits| in constant-time. +static void poly3_rotr_consttime(struct poly3 *p, size_t bits) { + assert(bits <= N); + HRSS_poly2_rotr_consttime(&p->s, bits); + HRSS_poly2_rotr_consttime(&p->a, bits); +} + +// poly3_fmadd sets |out| to |out| + |in|×m, where m is (ms, ma). +static void poly3_fmadd(struct poly3 *RESTRICT out, + const struct poly3 *RESTRICT in, crypto_word_t ms, + crypto_word_t ma) { + // (See the multiplication and addition circuits given above.) + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + const crypto_word_t s = in->s.v[i]; + const crypto_word_t a = in->a.v[i]; + const crypto_word_t product_s = (s & ma) ^ (ms & a); + const crypto_word_t product_a = (ms & s) ^ (ma & a); + + const crypto_word_t x = out->a.v[i] ^ product_a; + const crypto_word_t y = + (out->s.v[i] ^ product_s) ^ (out->a.v[i] & product_a); + const crypto_word_t z = (out->s.v[i] & product_s); + out->s.v[i] = y & ~x; + out->a.v[i] = z | (x & ~y); + } +} + +// final_bit_to_all replicates the bit in the final position of the last word to +// all the bits in the word. +static crypto_word_t final_bit_to_all(crypto_word_t v) { + return lsb_to_all(v >> (BITS_IN_LAST_WORD - 1)); +} + +// poly3_top_bits_are_clear returns one iff the extra bits in the final words of +// |p| are zero. +OPENSSL_UNUSED static int poly3_top_bits_are_clear(const struct poly3 *p) { + return poly2_top_bits_are_clear(&p->s) && poly2_top_bits_are_clear(&p->a); +} + +// poly3_mod_phiN reduces |p| by Φ(N). +static void poly3_mod_phiN(struct poly3 *p) { + // In order to reduce by Φ(N) we subtract by the value of the greatest + // coefficient. That's the same as adding the negative of its value. The + // negative of (s, a) is (a, s), so the arguments are swapped in the following + // two lines. 
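+  // (Reducing this way works because Φ(N) = 1 + x + ... + x^(N-1): subtracting
+  // the top coefficient c from every coefficient is the same as subtracting
+  // c×Φ(N), which zeroes the x^(N-1) term without changing the value mod
+  // Φ(N).)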
+ const crypto_word_t factor_s = final_bit_to_all(p->a.v[WORDS_PER_POLY - 1]); + const crypto_word_t factor_a = final_bit_to_all(p->s.v[WORDS_PER_POLY - 1]); + + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + const crypto_word_t s = p->s.v[i]; + const crypto_word_t a = p->a.v[i]; + const crypto_word_t x = a ^ factor_a; + const crypto_word_t y = (s ^ factor_s) ^ (a & factor_a); + const crypto_word_t z = (s & factor_s); + p->s.v[i] = y & ~x; + p->a.v[i] = z | (x & ~y); + } + + poly2_clear_top_bits(&p->s); + poly2_clear_top_bits(&p->a); +} + +static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap) { + poly2_cswap(&a->s, &b->s, swap); + poly2_cswap(&a->a, &b->a, swap); +} + +static void poly3_lshift1(struct poly3 *p) { + poly2_lshift1(&p->s); + poly2_lshift1(&p->a); +} + +static void poly3_rshift1(struct poly3 *p) { + poly2_rshift1(&p->s); + poly2_rshift1(&p->a); +} + +// poly3_span represents a pointer into a poly3. +struct poly3_span { + crypto_word_t *s; + crypto_word_t *a; +}; + +// poly3_word_add sets (|out_s|, |out_a|) to (|s1|, |a1|) + (|s2|, |a2|). +static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a, + const crypto_word_t s1, const crypto_word_t a1, + const crypto_word_t s2, const crypto_word_t a2) { + const crypto_word_t x = a1 ^ a2; + const crypto_word_t y = (s1 ^ s2) ^ (a1 & a2); + const crypto_word_t z = s1 & s2; + *out_s = y & ~x; + *out_a = z | (x & ~y); +} + +// poly3_span_add adds |n| words of values from |a| and |b| and writes the +// result to |out|. +static void poly3_span_add(const struct poly3_span *out, + const struct poly3_span *a, + const struct poly3_span *b, size_t n) { + for (size_t i = 0; i < n; i++) { + poly3_word_add(&out->s[i], &out->a[i], a->s[i], a->a[i], b->s[i], b->a[i]); + } +} + +// poly3_span_sub subtracts |n| words of |b| from |n| words of |a|. +static void poly3_span_sub(const struct poly3_span *a, + const struct poly3_span *b, size_t n) { + for (size_t i = 0; i < n; i++) { + // Swapping |b->s| and |b->a| negates the value being added. + poly3_word_add(&a->s[i], &a->a[i], a->s[i], a->a[i], b->a[i], b->s[i]); + } +} + +// poly3_mul_aux is a recursive function that multiplies |n| words from |a| and +// |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements of +// |scratch| and the function recurses, except if |n| == 1, when |scratch| isn't +// used and the recursion stops. For |n| in {11, 22}, the transitive total +// amount of |scratch| needed happens to be 2n+2. +static void poly3_mul_aux(const struct poly3_span *out, + const struct poly3_span *scratch, + const struct poly3_span *a, + const struct poly3_span *b, size_t n) { + if (n == 1) { + crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0; + crypto_word_t b_s = b->s[0], b_a = b->a[0]; + const crypto_word_t a_s = a->s[0], a_a = a->a[0]; + + for (size_t i = 0; i < BITS_PER_WORD; i++) { + // Multiply (s, a) by the next value from (b_s, b_a). + const crypto_word_t v_s = lsb_to_all(b_s); + const crypto_word_t v_a = lsb_to_all(b_a); + b_s >>= 1; + b_a >>= 1; + + const crypto_word_t m_s = (v_s & a_a) ^ (a_s & v_a); + const crypto_word_t m_a = (a_s & v_s) ^ (a_a & v_a); + + if (i == 0) { + // Special case otherwise the code tries to shift by BITS_PER_WORD + // below, which is undefined. + r_s_low = m_s; + r_a_low = m_a; + continue; + } + + // Shift the multiplication result to the correct position. 
+ const crypto_word_t m_s_low = m_s << i; + const crypto_word_t m_s_high = m_s >> (BITS_PER_WORD - i); + const crypto_word_t m_a_low = m_a << i; + const crypto_word_t m_a_high = m_a >> (BITS_PER_WORD - i); + + // Add into the result. + poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low); + poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high, + m_a_high); + } + + out->s[0] = r_s_low; + out->s[1] = r_s_high; + out->a[0] = r_a_low; + out->a[1] = r_a_high; + return; + } + + // Karatsuba multiplication. + // https://en.wikipedia.org/wiki/Karatsuba_algorithm + + // When |n| is odd, the two "halves" will have different lengths. The first + // is always the smaller. + const size_t low_len = n / 2; + const size_t high_len = n - low_len; + const struct poly3_span a_high = {&a->s[low_len], &a->a[low_len]}; + const struct poly3_span b_high = {&b->s[low_len], &b->a[low_len]}; + + // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second + // half. + const struct poly3_span a_cross_sum = *out; + const struct poly3_span b_cross_sum = {&out->s[high_len], &out->a[high_len]}; + poly3_span_add(&a_cross_sum, a, &a_high, low_len); + poly3_span_add(&b_cross_sum, b, &b_high, low_len); + if (high_len != low_len) { + a_cross_sum.s[low_len] = a_high.s[low_len]; + a_cross_sum.a[low_len] = a_high.a[low_len]; + b_cross_sum.s[low_len] = b_high.s[low_len]; + b_cross_sum.a[low_len] = b_high.a[low_len]; + } + + const struct poly3_span child_scratch = {&scratch->s[2 * high_len], + &scratch->a[2 * high_len]}; + const struct poly3_span out_mid = {&out->s[low_len], &out->a[low_len]}; + const struct poly3_span out_high = {&out->s[2 * low_len], + &out->a[2 * low_len]}; + + // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer. + poly3_mul_aux(scratch, &child_scratch, &a_cross_sum, &b_cross_sum, high_len); + // Calculate a_1 × b_1. + poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len); + // Calculate a_0 × b_0. + poly3_mul_aux(out, &child_scratch, a, b, low_len); + + // Subtract those last two products from the first. + poly3_span_sub(scratch, out, low_len * 2); + poly3_span_sub(scratch, &out_high, high_len * 2); + + // Add the middle product into the output. + poly3_span_add(&out_mid, &out_mid, scratch, high_len * 2); +} + +// HRSS_poly3_mul sets |*out| to |x|×|y| mod Φ(N). +void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x, + const struct poly3 *y) { + crypto_word_t prod_s[WORDS_PER_POLY * 2]; + crypto_word_t prod_a[WORDS_PER_POLY * 2]; + crypto_word_t scratch_s[WORDS_PER_POLY * 2 + 2]; + crypto_word_t scratch_a[WORDS_PER_POLY * 2 + 2]; + const struct poly3_span prod_span = {prod_s, prod_a}; + const struct poly3_span scratch_span = {scratch_s, scratch_a}; + const struct poly3_span x_span = {(crypto_word_t *)x->s.v, + (crypto_word_t *)x->a.v}; + const struct poly3_span y_span = {(crypto_word_t *)y->s.v, + (crypto_word_t *)y->a.v}; + + poly3_mul_aux(&prod_span, &scratch_span, &x_span, &y_span, WORDS_PER_POLY); + + // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the + // upper-half to the lower-half. However, N is 701, which isn't a multiple of + // BITS_PER_WORD, so the upper-half vectors all have to be shifted before + // being added to the lower-half. 
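+  // (Since x^N ≡ 1 mod (x^N - 1), the coefficient of x^(N+i) folds onto x^i.
+  // With 64-bit words, for example, N = 701 = 10×64 + 61, so the upper half
+  // has to be realigned by BITS_IN_LAST_WORD = 61 bits, which is what the
+  // shifts below do.)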
+ for (size_t i = 0; i < WORDS_PER_POLY; i++) { + crypto_word_t v_s = prod_s[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD; + v_s |= prod_s[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD); + crypto_word_t v_a = prod_a[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD; + v_a |= prod_a[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD); + + poly3_word_add(&out->s.v[i], &out->a.v[i], prod_s[i], prod_a[i], v_s, v_a); + } + + poly3_mod_phiN(out); +} + +#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64) + +// poly3_vec_cswap swaps (|a_s|, |a_a|) and (|b_s|, |b_a|) if |swap| is +// |0xff..ff|. Otherwise, |swap| must be zero. +static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6], + vec_t b_a[6], const vec_t swap) { + for (int i = 0; i < 6; i++) { + const vec_t sum_s = swap & (a_s[i] ^ b_s[i]); + a_s[i] ^= sum_s; + b_s[i] ^= sum_s; + + const vec_t sum_a = swap & (a_a[i] ^ b_a[i]); + a_a[i] ^= sum_a; + b_a[i] ^= sum_a; + } +} + +// poly3_vec_fmadd adds (|ms|, |ma|) × (|b_s|, |b_a|) to (|a_s|, |a_a|). +static inline void poly3_vec_fmadd(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6], + vec_t b_a[6], const vec_t ms, + const vec_t ma) { + for (int i = 0; i < 6; i++) { + const vec_t s = b_s[i]; + const vec_t a = b_a[i]; + const vec_t product_s = (s & ma) ^ (ms & a); + const vec_t product_a = (ms & s) ^ (ma & a); + + const vec_t x = a_a[i] ^ product_a; + const vec_t y = (a_s[i] ^ product_s) ^ (a_a[i] & product_a); + const vec_t z = (a_s[i] & product_s); + a_s[i] = y & ~x; + a_a[i] = z | (x & ~y); + } +} + +// poly3_invert_vec sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod +// Φ(N). +static void poly3_invert_vec(struct poly3 *out, const struct poly3 *in) { + // See the comment in |HRSS_poly3_invert| about this algorithm. In addition to + // the changes described there, this implementation attempts to use vector + // registers to speed up the computation. Even non-poly3 variables are held in + // vectors where possible to minimise the amount of data movement between + // the vector and general-purpose registers. + + vec_t b_s[6], b_a[6], c_s[6], c_a[6], f_s[6], f_a[6], g_s[6], g_a[6]; + const vec_t kZero = {0}; + const vec_t kOne = {1}; + static const uint8_t kOneBytes[sizeof(vec_t)] = {1}; + static const uint8_t kBottomSixtyOne[sizeof(vec_t)] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f}; + + memset(b_s, 0, sizeof(b_s)); + memcpy(b_a, kOneBytes, sizeof(kOneBytes)); + memset(&b_a[1], 0, 5 * sizeof(vec_t)); + + memset(c_s, 0, sizeof(c_s)); + memset(c_a, 0, sizeof(c_a)); + + f_s[5] = kZero; + memcpy(f_s, in->s.v, WORDS_PER_POLY * sizeof(crypto_word_t)); + f_a[5] = kZero; + memcpy(f_a, in->a.v, WORDS_PER_POLY * sizeof(crypto_word_t)); + + // Set g to all ones. 
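+  // (All ones, in this bit-sliced form, is the polynomial
+  // 1 + x + ... + x^(N-1), i.e. Φ(N) itself, which is the modulus for this
+  // inversion.)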
+ memset(g_s, 0, sizeof(g_s)); + memset(g_a, 0xff, 5 * sizeof(vec_t)); + memcpy(&g_a[5], kBottomSixtyOne, sizeof(kBottomSixtyOne)); + + vec_t deg_f = {N - 1}, deg_g = {N - 1}, rotation = kZero; + vec_t k = kOne; + vec_t f0s = {0}, f0a = {0}; + vec_t still_going; + memset(&still_going, 0xff, sizeof(still_going)); + + for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) { + const vec_t s_a = vec_broadcast_bit( + still_going & ((f_a[0] & g_s[0]) ^ (f_s[0] & g_a[0]))); + const vec_t s_s = vec_broadcast_bit( + still_going & ((f_a[0] & g_a[0]) ^ (f_s[0] & g_s[0]))); + const vec_t should_swap = + (s_s | s_a) & vec_broadcast_bit15(deg_f - deg_g); + + poly3_vec_cswap(f_s, f_a, g_s, g_a, should_swap); + poly3_vec_fmadd(f_s, f_a, g_s, g_a, s_s, s_a); + poly3_vec_rshift1(f_s, f_a); + + poly3_vec_cswap(b_s, b_a, c_s, c_a, should_swap); + poly3_vec_fmadd(b_s, b_a, c_s, c_a, s_s, s_a); + poly3_vec_lshift1(c_s, c_a); + + const vec_t deg_sum = should_swap & (deg_f ^ deg_g); + deg_f ^= deg_sum; + deg_g ^= deg_sum; + + deg_f -= kOne; + still_going &= ~vec_broadcast_bit15(deg_f - kOne); + + const vec_t f0_is_nonzero = vec_broadcast_bit(f_s[0] | f_a[0]); + // |f0_is_nonzero| implies |still_going|. + rotation ^= f0_is_nonzero & (k ^ rotation); + k += kOne; + + const vec_t f0s_sum = f0_is_nonzero & (f_s[0] ^ f0s); + f0s ^= f0s_sum; + const vec_t f0a_sum = f0_is_nonzero & (f_a[0] ^ f0a); + f0a ^= f0a_sum; + } + + crypto_word_t rotation_word = vec_get_word(rotation, 0); + rotation_word -= N & constant_time_lt_w(N, rotation_word); + memcpy(out->s.v, b_s, WORDS_PER_POLY * sizeof(crypto_word_t)); + memcpy(out->a.v, b_a, WORDS_PER_POLY * sizeof(crypto_word_t)); + assert(poly3_top_bits_are_clear(out)); + poly3_rotr_consttime(out, rotation_word); + poly3_mul_const(out, vec_get_word(f0s, 0), vec_get_word(f0a, 0)); + poly3_mod_phiN(out); +} + +#endif // HRSS_HAVE_VECTOR_UNIT + +// HRSS_poly3_invert sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod +// Φ(N). +void HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in) { + // The vector version of this function seems slightly slower on AArch64, but + // is useful on ARMv7 and x86-64. +#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64) + if (vec_capable()) { + poly3_invert_vec(out, in); + return; + } +#endif + + // This algorithm mostly follows algorithm 10 in the paper. Some changes: + // 1) k should start at zero, not one. In the code below k is omitted and + // the loop counter, |i|, is used instead. + // 2) The rotation count is conditionally updated to handle trailing zero + // coefficients. + // The best explanation for why it works is in the "Why it works" section of + // [NTRUTN14]. + + struct poly3 c, f, g; + OPENSSL_memcpy(&f, in, sizeof(f)); + + // Set g to all ones. + OPENSSL_memset(&g.s, 0, sizeof(struct poly2)); + OPENSSL_memset(&g.a, 0xff, sizeof(struct poly2)); + g.a.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD; + + struct poly3 *b = out; + poly3_zero(b); + poly3_zero(&c); + // Set b to one. 
+ b->a.v[0] = 1; + + crypto_word_t deg_f = N - 1, deg_g = N - 1, rotation = 0; + crypto_word_t f0s = 0, f0a = 0; + crypto_word_t still_going = CONSTTIME_TRUE_W; + + for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) { + const crypto_word_t s_a = lsb_to_all( + still_going & ((f.a.v[0] & g.s.v[0]) ^ (f.s.v[0] & g.a.v[0]))); + const crypto_word_t s_s = lsb_to_all( + still_going & ((f.a.v[0] & g.a.v[0]) ^ (f.s.v[0] & g.s.v[0]))); + const crypto_word_t should_swap = + (s_s | s_a) & constant_time_lt_w(deg_f, deg_g); + + poly3_cswap(&f, &g, should_swap); + poly3_cswap(b, &c, should_swap); + + const crypto_word_t deg_sum = should_swap & (deg_f ^ deg_g); + deg_f ^= deg_sum; + deg_g ^= deg_sum; + assert(deg_g >= 1); + + poly3_fmadd(&f, &g, s_s, s_a); + poly3_fmadd(b, &c, s_s, s_a); + poly3_rshift1(&f); + poly3_lshift1(&c); + + deg_f--; + const crypto_word_t f0_is_nonzero = + lsb_to_all(f.s.v[0]) | lsb_to_all(f.a.v[0]); + // |f0_is_nonzero| implies |still_going|. + assert(!(f0_is_nonzero && !still_going)); + still_going &= ~constant_time_is_zero_w(deg_f); + + rotation = constant_time_select_w(f0_is_nonzero, i, rotation); + f0s = constant_time_select_w(f0_is_nonzero, f.s.v[0], f0s); + f0a = constant_time_select_w(f0_is_nonzero, f.a.v[0], f0a); + } + + rotation++; + rotation -= N & constant_time_lt_w(N, rotation); + assert(poly3_top_bits_are_clear(out)); + poly3_rotr_consttime(out, rotation); + poly3_mul_const(out, f0s, f0a); + poly3_mod_phiN(out); +} + +// Polynomials in Q. + +// Coefficients are reduced mod Q. (Q is clearly not prime, therefore the +// coefficients do not form a field.) +#define Q 8192 + +// VECS_PER_POLY is the number of 128-bit vectors needed to represent a +// polynomial. +#define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t)) +#define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC) + +// poly represents a polynomial with coefficients mod Q. Note that, while Q is a +// power of two, this does not operate in GF(Q). That would be a binary field +// but this is simply mod Q. Thus the coefficients are not a field. +// +// Coefficients are ordered little-endian, thus the coefficient of x^0 is the +// first element of the array. +struct poly { +#if defined(HRSS_HAVE_VECTOR_UNIT) + union { + // N + 3 = 704, which is a multiple of 64 and thus aligns things, esp for + // the vector code. + uint16_t v[N + 3]; + vec_t vectors[VECS_PER_POLY]; + }; +#else + uint16_t v[N + 3]; +#endif +}; + +OPENSSL_UNUSED static void poly_print(const struct poly *p) { + printf("["); + for (unsigned i = 0; i < N; i++) { + if (i) { + printf(" "); + } + printf("%d", p->v[i]); + } + printf("]\n"); +} + +#if defined(HRSS_HAVE_VECTOR_UNIT) + +// poly_mul_vec_aux is a recursive function that multiplies |n| words from |a| +// and |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements +// of |scratch| and the function recurses, except if |n| < 3, when |scratch| +// isn't used and the recursion stops. If |n| == |VECS_PER_POLY| then |scratch| +// needs 172 elements. +static void poly_mul_vec_aux(vec_t *restrict out, vec_t *restrict scratch, + const vec_t *restrict a, const vec_t *restrict b, + const size_t n) { + // In [HRSS], the technique they used for polynomial multiplication is + // described: they start with Toom-4 at the top level and then two layers of + // Karatsuba. Karatsuba is a specific instance of the general Toom–Cook + // decomposition, which splits an input n-ways and produces 2n-1 + // multiplications of those parts. 
So, starting with 704 coefficients (rounded + // up from 701 to have more factors of two), Toom-4 gives seven + // multiplications of degree-174 polynomials. Each round of Karatsuba (which + // is Toom-2) increases the number of multiplications by a factor of three + // while halving the size of the values being multiplied. So two rounds gives + // 63 multiplications of degree-44 polynomials. Then they (I think) form + // vectors by gathering all 63 coefficients of each power together, for each + // input, and doing more rounds of Karatsuba on the vectors until they bottom- + // out somewhere with schoolbook multiplication. + // + // I tried something like that for NEON. NEON vectors are 128 bits so hold + // eight coefficients. I wrote a function that did Karatsuba on eight + // multiplications at the same time, using such vectors, and a Go script that + // decomposed from degree-704, with Karatsuba in non-transposed form, until it + // reached multiplications of degree-44. It batched up those 81 + // multiplications into lots of eight with a single one left over (which was + // handled directly). + // + // It worked, but it was significantly slower than the dumb algorithm used + // below. Potentially that was because I misunderstood how [HRSS] did it, or + // because Clang is bad at generating good code from NEON intrinsics on ARMv7. + // (Which is true: the code generated by Clang for the below is pretty crap.) + // + // This algorithm is much simpler. It just does Karatsuba decomposition all + // the way down and never transposes. When it gets down to degree-16 or + // degree-24 values, they are multiplied using schoolbook multiplication and + // vector intrinsics. The vector operations form each of the eight phase- + // shifts of one of the inputs, point-wise multiply, and then add into the + // result at the correct place. This means that 33% (degree-16) or 25% + // (degree-24) of the multiplies and adds are wasted, but it does ok. 
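+  //
+  // As a scalar sketch of the base case below (illustrative only; the real
+  // code works on vec_t values holding eight uint16_t coefficients each):
+  //
+  //   // Schoolbook product of two 16-coefficient polynomials a and b.
+  //   for (size_t i = 0; i < 16; i++) {
+  //     for (size_t j = 0; j < 16; j++) {
+  //       result[i + j] += a[i] * b[j];
+  //     }
+  //   }
+  //
+  // The vectorised version below fixes one coefficient of |b| at a time
+  // (fetched with vec_get_word), multiplies a lane-shifted copy of |a| by it
+  // with vec_mul/vec_fma, and accumulates into |result| at the matching
+  // offset; vec3_rshift_word/vec4_rshift_word provide the shifted copies.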
+ if (n == 2) { + vec_t result[4]; + vec_t vec_a[3]; + static const vec_t kZero = {0}; + vec_a[0] = a[0]; + vec_a[1] = a[1]; + vec_a[2] = kZero; + + result[0] = vec_mul(vec_a[0], vec_get_word(b[0], 0)); + result[1] = vec_mul(vec_a[1], vec_get_word(b[0], 0)); + + result[1] = vec_fma(result[1], vec_a[0], vec_get_word(b[1], 0)); + result[2] = vec_mul(vec_a[1], vec_get_word(b[1], 0)); + result[3] = kZero; + + vec3_rshift_word(vec_a); + +#define BLOCK(x, y) \ + do { \ + result[x + 0] = \ + vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ + result[x + 1] = \ + vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ + result[x + 2] = \ + vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \ + } while (0) + + BLOCK(0, 1); + BLOCK(1, 9); + + vec3_rshift_word(vec_a); + + BLOCK(0, 2); + BLOCK(1, 10); + + vec3_rshift_word(vec_a); + + BLOCK(0, 3); + BLOCK(1, 11); + + vec3_rshift_word(vec_a); + + BLOCK(0, 4); + BLOCK(1, 12); + + vec3_rshift_word(vec_a); + + BLOCK(0, 5); + BLOCK(1, 13); + + vec3_rshift_word(vec_a); + + BLOCK(0, 6); + BLOCK(1, 14); + + vec3_rshift_word(vec_a); + + BLOCK(0, 7); + BLOCK(1, 15); + +#undef BLOCK + + memcpy(out, result, sizeof(result)); + return; + } + + if (n == 3) { + vec_t result[6]; + vec_t vec_a[4]; + static const vec_t kZero = {0}; + vec_a[0] = a[0]; + vec_a[1] = a[1]; + vec_a[2] = a[2]; + vec_a[3] = kZero; + + result[0] = vec_mul(a[0], vec_get_word(b[0], 0)); + result[1] = vec_mul(a[1], vec_get_word(b[0], 0)); + result[2] = vec_mul(a[2], vec_get_word(b[0], 0)); + +#define BLOCK_PRE(x, y) \ + do { \ + result[x + 0] = \ + vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ + result[x + 1] = \ + vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ + result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8)); \ + } while (0) + + BLOCK_PRE(1, 8); + BLOCK_PRE(2, 16); + + result[5] = kZero; + + vec4_rshift_word(vec_a); + +#define BLOCK(x, y) \ + do { \ + result[x + 0] = \ + vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ + result[x + 1] = \ + vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ + result[x + 2] = \ + vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \ + result[x + 3] = \ + vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \ + } while (0) + + BLOCK(0, 1); + BLOCK(1, 9); + BLOCK(2, 17); + + vec4_rshift_word(vec_a); + + BLOCK(0, 2); + BLOCK(1, 10); + BLOCK(2, 18); + + vec4_rshift_word(vec_a); + + BLOCK(0, 3); + BLOCK(1, 11); + BLOCK(2, 19); + + vec4_rshift_word(vec_a); + + BLOCK(0, 4); + BLOCK(1, 12); + BLOCK(2, 20); + + vec4_rshift_word(vec_a); + + BLOCK(0, 5); + BLOCK(1, 13); + BLOCK(2, 21); + + vec4_rshift_word(vec_a); + + BLOCK(0, 6); + BLOCK(1, 14); + BLOCK(2, 22); + + vec4_rshift_word(vec_a); + + BLOCK(0, 7); + BLOCK(1, 15); + BLOCK(2, 23); + +#undef BLOCK +#undef BLOCK_PRE + + memcpy(out, result, sizeof(result)); + + return; + } + + // Karatsuba multiplication. + // https://en.wikipedia.org/wiki/Karatsuba_algorithm + + // When |n| is odd, the two "halves" will have different lengths. The first is + // always the smaller. + const size_t low_len = n / 2; + const size_t high_len = n - low_len; + const vec_t *a_high = &a[low_len]; + const vec_t *b_high = &b[low_len]; + + // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second + // half. 
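+  // (This is the usual Karatsuba step: writing a = a_1·𝑥^k + a_0 and
+  // b = b_1·𝑥^k + b_0, the product is
+  //   a_1·b_1·𝑥^(2k) + ((a_1+a_0)·(b_1+b_0) - a_1·b_1 - a_0·b_0)·𝑥^k + a_0·b_0,
+  // so only three half-sized multiplications are needed; |out| doubles as
+  // temporary space for the two sums until the recursive calls overwrite it.)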
+ for (size_t i = 0; i < low_len; i++) { + out[i] = vec_add(a_high[i], a[i]); + out[high_len + i] = vec_add(b_high[i], b[i]); + } + if (high_len != low_len) { + out[low_len] = a_high[low_len]; + out[high_len + low_len] = b_high[low_len]; + } + + vec_t *const child_scratch = &scratch[2 * high_len]; + // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer. + poly_mul_vec_aux(scratch, child_scratch, out, &out[high_len], high_len); + // Calculate a_1 × b_1. + poly_mul_vec_aux(&out[low_len * 2], child_scratch, a_high, b_high, high_len); + // Calculate a_0 × b_0. + poly_mul_vec_aux(out, child_scratch, a, b, low_len); + + // Subtract those last two products from the first. + for (size_t i = 0; i < low_len * 2; i++) { + scratch[i] = vec_sub(scratch[i], vec_add(out[i], out[low_len * 2 + i])); + } + if (low_len != high_len) { + scratch[low_len * 2] = vec_sub(scratch[low_len * 2], out[low_len * 4]); + scratch[low_len * 2 + 1] = + vec_sub(scratch[low_len * 2 + 1], out[low_len * 4 + 1]); + } + + // Add the middle product into the output. + for (size_t i = 0; i < high_len * 2; i++) { + out[low_len + i] = vec_add(out[low_len + i], scratch[i]); + } +} + +// poly_mul_vec sets |*out| to |x|×|y| mod (𝑥^n - 1). +static void poly_mul_vec(struct poly *out, const struct poly *x, + const struct poly *y) { + OPENSSL_memset((uint16_t *)&x->v[N], 0, 3 * sizeof(uint16_t)); + OPENSSL_memset((uint16_t *)&y->v[N], 0, 3 * sizeof(uint16_t)); + + OPENSSL_STATIC_ASSERT(sizeof(out->v) == sizeof(vec_t) * VECS_PER_POLY, + "struct poly is the wrong size"); + OPENSSL_STATIC_ASSERT(alignof(struct poly) == alignof(vec_t), + "struct poly has incorrect alignment"); + + vec_t prod[VECS_PER_POLY * 2]; + vec_t scratch[172]; + poly_mul_vec_aux(prod, scratch, x->vectors, y->vectors, VECS_PER_POLY); + + // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the + // upper-half to the lower-half. However, N is 701, which isn't a multiple of + // the vector size, so the upper-half vectors all have to be shifted before + // being added to the lower-half. + vec_t *out_vecs = (vec_t *)out->v; + + for (size_t i = 0; i < VECS_PER_POLY; i++) { + const vec_t prev = prod[VECS_PER_POLY - 1 + i]; + const vec_t this = prod[VECS_PER_POLY + i]; + out_vecs[i] = vec_add(prod[i], vec_merge_3_5(prev, this)); + } + + OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t)); +} + +#endif // HRSS_HAVE_VECTOR_UNIT + +// poly_mul_novec_aux writes the product of |a| and |b| to |out|, using +// |scratch| as scratch space. It'll use Karatsuba if the inputs are large +// enough to warrant it. Each call uses 2*ceil(n/2) elements of |scratch| and +// the function recurses, except if |n| < 64, when |scratch| isn't used and the +// recursion stops. If |n| == |N| then |scratch| needs 1318 elements. +static void poly_mul_novec_aux(uint16_t *out, uint16_t *scratch, + const uint16_t *a, const uint16_t *b, size_t n) { + static const size_t kSchoolbookLimit = 64; + if (n < kSchoolbookLimit) { + OPENSSL_memset(out, 0, sizeof(uint16_t) * n * 2); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + out[i + j] += (unsigned) a[i] * b[j]; + } + } + + return; + } + + // Karatsuba multiplication. + // https://en.wikipedia.org/wiki/Karatsuba_algorithm + + // When |n| is odd, the two "halves" will have different lengths. The + // first is always the smaller. 
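+  // (For example, n = 701 at the top level gives low_len = 350 and
+  // high_len = 351; the extra element of each high half is simply copied
+  // across when the sums are formed below.)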
+ const size_t low_len = n / 2; + const size_t high_len = n - low_len; + const uint16_t *const a_high = &a[low_len]; + const uint16_t *const b_high = &b[low_len]; + + for (size_t i = 0; i < low_len; i++) { + out[i] = a_high[i] + a[i]; + out[high_len + i] = b_high[i] + b[i]; + } + if (high_len != low_len) { + out[low_len] = a_high[low_len]; + out[high_len + low_len] = b_high[low_len]; + } + + uint16_t *const child_scratch = &scratch[2 * high_len]; + poly_mul_novec_aux(scratch, child_scratch, out, &out[high_len], high_len); + poly_mul_novec_aux(&out[low_len * 2], child_scratch, a_high, b_high, + high_len); + poly_mul_novec_aux(out, child_scratch, a, b, low_len); + + for (size_t i = 0; i < low_len * 2; i++) { + scratch[i] -= out[i] + out[low_len * 2 + i]; + } + if (low_len != high_len) { + scratch[low_len * 2] -= out[low_len * 4]; + assert(out[low_len * 4 + 1] == 0); + } + + for (size_t i = 0; i < high_len * 2; i++) { + out[low_len + i] += scratch[i]; + } +} + +// poly_mul_novec sets |*out| to |x|×|y| mod (𝑥^n - 1). +static void poly_mul_novec(struct poly *out, const struct poly *x, + const struct poly *y) { + uint16_t prod[2 * N]; + uint16_t scratch[1318]; + poly_mul_novec_aux(prod, scratch, x->v, y->v, N); + + for (size_t i = 0; i < N; i++) { + out->v[i] = prod[i] + prod[i + N]; + } + OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t)); +} + +// On x86-64, we can use the AVX2 code from [HRSS]. (The authors have given +// explicit permission for this and signed a CLA.) However it's 57KB of object +// code, so it's not used if |OPENSSL_SMALL| is defined. +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \ + defined(OPENSSL_X86_64) && defined(OPENSSL_LINUX) +// poly_Rq_mul is defined in assembly. +extern void poly_Rq_mul(struct poly *r, const struct poly *a, + const struct poly *b); +#endif + +// The file cannot always be built with -mfpu=neon on ARMv7 because that would +// enable NEON instructions everywhere, not just in functions guarded by a +// runtime check for NEON capability. Therefore on ARMv7, if -mfpu=neon isn't +// used, a version of the vector code that has been precompiled and checked-in +// as assembly sources is used. (For AArch64, NEON is assumed to be provided.) +#if defined(OPENSSL_ARM) && !defined(HRSS_HAVE_VECTOR_UNIT) +// poly_mul_vec is defined in assembly. +extern void poly_mul_vec(struct poly *out, const struct poly *x, + const struct poly *y); +#endif + +static void poly_mul(struct poly *r, const struct poly *a, + const struct poly *b) { +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \ + defined(OPENSSL_X86_64) && defined(OPENSSL_LINUX) + const int has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0; + if (has_avx2) { + poly_Rq_mul(r, a, b); + return; + } +#endif + +#if defined(HRSS_HAVE_VECTOR_UNIT) + if (vec_capable()) { + poly_mul_vec(r, a, b); + return; + } +#endif + +#if defined(OPENSSL_ARM) && !defined(HRSS_HAVE_VECTOR_UNIT) + // See above about this call. + if (CRYPTO_is_NEON_capable()) { + poly_mul_vec(r, a, b); + return; + } +#endif + + // Fallback, non-vector case. + poly_mul_novec(r, a, b); +} + +// poly_mul_x_minus_1 sets |p| to |p|×(𝑥 - 1) mod (𝑥^n - 1). +static void poly_mul_x_minus_1(struct poly *p) { + // Multiplying by (𝑥 - 1) means negating each coefficient and adding in + // the value of the previous one. 
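+  // (For example, with N = 3 and p = c0 + c1·𝑥 + c2·𝑥^2, the product
+  // p·(𝑥 - 1) mod (𝑥^3 - 1) is (c2 - c0) + (c0 - c1)·𝑥 + (c1 - c2)·𝑥^2:
+  // each coefficient becomes the previous one minus itself, with the
+  // "previous" coefficient of 𝑥^0 wrapping around from 𝑥^(N-1).)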
+ const uint16_t orig_final_coefficient = p->v[N - 1]; + + for (size_t i = N - 1; i > 0; i--) { + p->v[i] = p->v[i - 1] - p->v[i]; + } + p->v[0] = orig_final_coefficient - p->v[0]; +} + +// poly_mod_phiN sets |p| to |p| mod Φ(N). +static void poly_mod_phiN(struct poly *p) { + const uint16_t coeff700 = p->v[N - 1]; + + for (unsigned i = 0; i < N; i++) { + p->v[i] -= coeff700; + } +} + +// poly_clamp reduces each coefficient mod Q. +static void poly_clamp(struct poly *p) { + for (unsigned i = 0; i < N; i++) { + p->v[i] &= Q - 1; + } +} + + +// Conversion functions +// -------------------- + +// poly2_from_poly sets |*out| to |in| mod 2. +static void poly2_from_poly(struct poly2 *out, const struct poly *in) { + crypto_word_t *words = out->v; + unsigned shift = 0; + crypto_word_t word = 0; + + for (unsigned i = 0; i < N; i++) { + word >>= 1; + word |= (crypto_word_t)(in->v[i] & 1) << (BITS_PER_WORD - 1); + shift++; + + if (shift == BITS_PER_WORD) { + *words = word; + words++; + word = 0; + shift = 0; + } + } + + word >>= BITS_PER_WORD - shift; + *words = word; +} + +// mod3 treats |a| is a signed number and returns |a| mod 3. +static uint16_t mod3(int16_t a) { + const int16_t q = ((int32_t)a * 21845) >> 16; + int16_t ret = a - 3 * q; + // At this point, |ret| is in {0, 1, 2, 3} and that needs to be mapped to {0, + // 1, 2, 0}. + return ret & ((ret & (ret >> 1)) - 1); +} + +// poly3_from_poly sets |*out| to |in|. +static void poly3_from_poly(struct poly3 *out, const struct poly *in) { + crypto_word_t *words_s = out->s.v; + crypto_word_t *words_a = out->a.v; + crypto_word_t s = 0; + crypto_word_t a = 0; + unsigned shift = 0; + + for (unsigned i = 0; i < N; i++) { + // This duplicates the 13th bit upwards to the top of the uint16, + // essentially treating it as a sign bit and converting into a signed int16. + // The signed value is reduced mod 3, yielding {0, 1, 2}. + const uint16_t v = mod3((int16_t)(in->v[i] << 3) >> 3); + s >>= 1; + s |= (crypto_word_t)(v & 2) << (BITS_PER_WORD - 2); + a >>= 1; + a |= (crypto_word_t)(v & 1) << (BITS_PER_WORD - 1); + shift++; + + if (shift == BITS_PER_WORD) { + *words_s = s; + words_s++; + *words_a = a; + words_a++; + s = a = 0; + shift = 0; + } + } + + s >>= BITS_PER_WORD - shift; + a >>= BITS_PER_WORD - shift; + *words_s = s; + *words_a = a; +} + +// poly3_from_poly_checked sets |*out| to |in|, which has coefficients in {0, 1, +// Q-1}. It returns a mask indicating whether all coefficients were found to be +// in that set. +static crypto_word_t poly3_from_poly_checked(struct poly3 *out, + const struct poly *in) { + crypto_word_t *words_s = out->s.v; + crypto_word_t *words_a = out->a.v; + crypto_word_t s = 0; + crypto_word_t a = 0; + unsigned shift = 0; + crypto_word_t ok = CONSTTIME_TRUE_W; + + for (unsigned i = 0; i < N; i++) { + const uint16_t v = in->v[i]; + // Maps {0, 1, Q-1} to {0, 1, 2}. 
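+    // (v & 3 gives 0, 1 and 3 for those inputs and the xor below folds 3 down
+    // to 2. |expected| applies the reverse map, taking {0, 1, 2} back to
+    // {0, 1, Q-1}, so comparing it against |v| rejects any coefficient outside
+    // the allowed set without branching on secret data.)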
+ uint16_t mod3 = v & 3; + mod3 ^= mod3 >> 1; + const uint16_t expected = (uint16_t)((~((mod3 >> 1) - 1)) | mod3) % Q; + ok &= constant_time_eq_w(v, expected); + + s >>= 1; + s |= (crypto_word_t)(mod3 & 2) << (BITS_PER_WORD - 2); + a >>= 1; + a |= (crypto_word_t)(mod3 & 1) << (BITS_PER_WORD - 1); + shift++; + + if (shift == BITS_PER_WORD) { + *words_s = s; + words_s++; + *words_a = a; + words_a++; + s = a = 0; + shift = 0; + } + } + + s >>= BITS_PER_WORD - shift; + a >>= BITS_PER_WORD - shift; + *words_s = s; + *words_a = a; + + return ok; +} + +static void poly_from_poly2(struct poly *out, const struct poly2 *in) { + const crypto_word_t *words = in->v; + unsigned shift = 0; + crypto_word_t word = *words; + + for (unsigned i = 0; i < N; i++) { + out->v[i] = word & 1; + word >>= 1; + shift++; + + if (shift == BITS_PER_WORD) { + words++; + word = *words; + shift = 0; + } + } +} + +static void poly_from_poly3(struct poly *out, const struct poly3 *in) { + const crypto_word_t *words_s = in->s.v; + const crypto_word_t *words_a = in->a.v; + crypto_word_t word_s = ~(*words_s); + crypto_word_t word_a = *words_a; + unsigned shift = 0; + + for (unsigned i = 0; i < N; i++) { + out->v[i] = (uint16_t)(word_s & 1) - 1; + out->v[i] |= word_a & 1; + word_s >>= 1; + word_a >>= 1; + shift++; + + if (shift == BITS_PER_WORD) { + words_s++; + words_a++; + word_s = ~(*words_s); + word_a = *words_a; + shift = 0; + } + } +} + +// Polynomial inversion +// -------------------- + +// poly_invert_mod2 sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod +// Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion +// mod Q. +static void poly_invert_mod2(struct poly *out, const struct poly *in) { + // This algorithm follows algorithm 10 in the paper. (Although, in contrast to + // the paper, k should start at zero, not one, and the rotation count is needs + // to handle trailing zero coefficients.) The best explanation for why it + // works is in the "Why it works" section of [NTRUTN14]. + + struct poly2 b, c, f, g; + poly2_from_poly(&f, in); + OPENSSL_memset(&b, 0, sizeof(b)); + b.v[0] = 1; + OPENSSL_memset(&c, 0, sizeof(c)); + + // Set g to all ones. + OPENSSL_memset(&g, 0xff, sizeof(struct poly2)); + g.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD; + + crypto_word_t deg_f = N - 1, deg_g = N - 1, rotation = 0; + crypto_word_t still_going = CONSTTIME_TRUE_W; + + for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) { + const crypto_word_t s = still_going & lsb_to_all(f.v[0]); + const crypto_word_t should_swap = s & constant_time_lt_w(deg_f, deg_g); + poly2_cswap(&f, &g, should_swap); + poly2_cswap(&b, &c, should_swap); + const crypto_word_t deg_sum = should_swap & (deg_f ^ deg_g); + deg_f ^= deg_sum; + deg_g ^= deg_sum; + assert(deg_g >= 1); + poly2_fmadd(&f, &g, s); + poly2_fmadd(&b, &c, s); + + poly2_rshift1(&f); + poly2_lshift1(&c); + + deg_f--; + const crypto_word_t f0_is_nonzero = lsb_to_all(f.v[0]); + // |f0_is_nonzero| implies |still_going|. + assert(!(f0_is_nonzero && !still_going)); + rotation = constant_time_select_w(f0_is_nonzero, i, rotation); + still_going &= ~constant_time_is_zero_w(deg_f); + } + + rotation++; + rotation -= N & constant_time_lt_w(N, rotation); + assert(poly2_top_bits_are_clear(&b)); + HRSS_poly2_rotr_consttime(&b, rotation); + poly_from_poly2(out, &b); +} + +// poly_invert sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod Φ(N)). 
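+//
+// The loop below is Newton's method for an inverse modulo a power of two: if
+// in·b ≡ 1 (mod 2^k) then b' = b·(2 - in·b) satisfies in·b' ≡ 1 (mod 2^(2k)),
+// so the number of correct bits doubles each round. Starting from the inverse
+// mod 2, four rounds give an inverse mod 2^16, which covers Q = 2^13. (In the
+// code below |a| is -in, so |tmp| = a·b + 2 = 2 - in·b.)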
+static void poly_invert(struct poly *out, const struct poly *in) { + // Inversion mod Q, which is done based on the result of inverting mod + // 2. See [NTRUTN14] paper, bottom of page two. + struct poly a, *b, tmp; + + // a = -in. + for (unsigned i = 0; i < N; i++) { + a.v[i] = -in->v[i]; + } + + // b = in^-1 mod 2. + b = out; + poly_invert_mod2(b, in); + + // We are working mod Q=2**13 and we need to iterate ceil(log_2(13)) + // times, which is four. + for (unsigned i = 0; i < 4; i++) { + poly_mul(&tmp, &a, b); + tmp.v[0] += 2; + poly_mul(b, b, &tmp); + } +} + +// Marshal and unmarshal functions for various basic types. +// -------------------------------------------------------- + +#define POLY_BYTES 1138 + +static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) { + const uint16_t *p = in->v; + + for (size_t i = 0; i < N / 8; i++) { + out[0] = p[0]; + out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5); + out[2] = p[1] >> 3; + out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2); + out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7); + out[5] = p[3] >> 1; + out[6] = (0xf & (p[3] >> 9)) | ((p[4] & 0x0f) << 4); + out[7] = p[4] >> 4; + out[8] = (1 & (p[4] >> 12)) | ((p[5] & 0x7f) << 1); + out[9] = (0x3f & (p[5] >> 7)) | ((p[6] & 0x03) << 6); + out[10] = p[6] >> 2; + out[11] = (7 & (p[6] >> 10)) | ((p[7] & 0x1f) << 3); + out[12] = p[7] >> 5; + + p += 8; + out += 13; + } + + // There are four remaining values. + out[0] = p[0]; + out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5); + out[2] = p[1] >> 3; + out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2); + out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7); + out[5] = p[3] >> 1; + out[6] = 0xf & (p[3] >> 9); +} + +static void poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) { + uint16_t *p = out->v; + + for (size_t i = 0; i < N / 8; i++) { + p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8; + p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 | + (uint16_t)(in[3] & 3) << 11; + p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6; + p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 | + (uint16_t)(in[6] & 0xf) << 9; + p[4] = (uint16_t)(in[6] >> 4) | (uint16_t)(in[7]) << 4 | + (uint16_t)(in[8] & 1) << 12; + p[5] = (uint16_t)(in[8] >> 1) | (uint16_t)(in[9] & 0x3f) << 7; + p[6] = (uint16_t)(in[9] >> 6) | (uint16_t)(in[10]) << 2 | + (uint16_t)(in[11] & 7) << 10; + p[7] = (uint16_t)(in[11] >> 3) | (uint16_t)(in[12]) << 5; + + p += 8; + in += 13; + } + + // There are four coefficients remaining. + p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8; + p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 | + (uint16_t)(in[3] & 3) << 11; + p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6; + p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 | + (uint16_t)(in[6] & 0xf) << 9; + + for (unsigned i = 0; i < N - 1; i++) { + out->v[i] = (int16_t)(out->v[i] << 3) >> 3; + } + + // There are four unused bits at the top of the final byte. They are always + // marshaled as zero by this code but we allow them to take any value when + // parsing in order to support future extension. + + // Set the final coefficient as specifed in [HRSSNIST] 1.9.2 step 6. + uint32_t sum = 0; + for (size_t i = 0; i < N - 1; i++) { + sum += out->v[i]; + } + + out->v[N - 1] = (uint16_t)(0u - sum); +} + +// mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}. Note that |v| may +// have an invalid value when processing attacker-controlled inputs. 
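+//
+// As a truth table: v & 3 is 0, 1, 3 and 3 for those four inputs, and
+// v ^ (v >> 1) then maps 3 to 2 while leaving 0 and 1 unchanged.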
+static uint16_t mod3_from_modQ(uint16_t v) {
+  v &= 3;
+  return v ^ (v >> 1);
+}
+
+// poly_marshal_mod3 marshals |in| to |out| where the coefficients of |in| are
+// all in {0, 1, Q-1, 65535} and |in| is mod Φ(N). (Note that coefficients may
+// have invalid values when processing attacker-controlled inputs.)
+static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES],
+                              const struct poly *in) {
+  const uint16_t *coeffs = in->v;
+
+  // Only 700 coefficients are marshaled because in[700] must be zero.
+  assert(coeffs[N-1] == 0);
+
+  for (size_t i = 0; i < HRSS_POLY3_BYTES; i++) {
+    const uint16_t coeffs0 = mod3_from_modQ(coeffs[0]);
+    const uint16_t coeffs1 = mod3_from_modQ(coeffs[1]);
+    const uint16_t coeffs2 = mod3_from_modQ(coeffs[2]);
+    const uint16_t coeffs3 = mod3_from_modQ(coeffs[3]);
+    const uint16_t coeffs4 = mod3_from_modQ(coeffs[4]);
+    out[i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81;
+    coeffs += 5;
+  }
+}
+
+// HRSS-specific functions
+// -----------------------
+
+// poly_short_sample implements the sampling algorithm given in [HRSSNIST]
+// section 1.8.1. The output coefficients are in {0, 1, 0xffff} which makes some
+// later computation easier.
+static void poly_short_sample(struct poly *out,
+                              const uint8_t in[HRSS_SAMPLE_BYTES]) {
+  // We wish to calculate the difference (mod 3) between two, two-bit numbers.
+  // Here is a table of results for a - b. Negative one is written as 0b11 so
+  // that a couple of shifts can be used to sign-extend it. Any input value of
+  // 0b11 is invalid and a convention is adopted that an invalid input results
+  // in an invalid output (0b10).
+  //
+  //  b  a  result
+  // 00 00  00
+  // 00 01  01
+  // 00 10  11
+  // 00 11  10
+  // 01 00  11
+  // 01 01  00
+  // 01 10  01
+  // 01 11  10
+  // 10 00  01
+  // 10 01  11
+  // 10 10  00
+  // 10 11  10
+  // 11 00  10
+  // 11 01  10
+  // 11 10  10
+  // 11 11  10
+  //
+  // The result column is encoded in a single-word lookup-table:
+  //   0001 1110 1100 0110 0111 0010 1010 1010
+  //     1    e    c    6    7    2    a    a
+  static const uint32_t kLookup = 0x1ec672aa;
+
+  // In order to generate pairs of numbers mod 3 (non-uniformly) we treat pairs
+  // of bits in a uint32 as separate values and sum two random vectors of 1-bit
+  // numbers. This works because the pairs are isolated: no carry can spread
+  // between them.
+
+  uint16_t *p = out->v;
+  for (size_t i = 0; i < N / 8; i++) {
+    uint32_t v;
+    OPENSSL_memcpy(&v, in, sizeof(v));
+    in += sizeof(v);
+
+    uint32_t sums = (v & 0x55555555) + ((v >> 1) & 0x55555555);
+    for (unsigned j = 0; j < 8; j++) {
+      p[j] = (int32_t)(kLookup << ((sums & 15) << 1)) >> 30;
+      sums >>= 4;
+    }
+    p += 8;
+  }
+
+  // There are four values remaining.
+  uint16_t v;
+  OPENSSL_memcpy(&v, in, sizeof(v));
+
+  uint16_t sums = (v & 0x5555) + ((v >> 1) & 0x5555);
+  for (unsigned j = 0; j < 4; j++) {
+    p[j] = (int32_t)(kLookup << ((sums & 15) << 1)) >> 30;
+    sums >>= 4;
+  }
+
+  out->v[N - 1] = 0;
+}
+
+// poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST],
+// section 1.8.2.
+static void poly_short_sample_plus(struct poly *out,
+                                   const uint8_t in[HRSS_SAMPLE_BYTES]) {
+  poly_short_sample(out, in);
+
+  // sum (and the product in the for loop) will overflow. But that's fine
+  // because |sum| is bounded by +/- (N-2), and N < 2^15 so it works out.
+  uint16_t sum = 0;
+  for (unsigned i = 0; i < N - 2; i++) {
+    sum += (unsigned) out->v[i] * out->v[i + 1];
+  }
+
+  // If the sum is negative, flip the sign of even-positioned coefficients. (See
+  // page 8 of [HRSS].)
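+  // (The arithmetic shift below turns |sum| into 0 if it was non-negative and
+  // 0xffff if it was negative. |scale| is then 1 or 0xffff (i.e. -1 mod 2^16),
+  // so the multiplication either leaves the even-indexed coefficients alone or
+  // negates them, without branching on secret data.)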
+  sum = ((int16_t) sum) >> 15;
+  const uint16_t scale = sum | (~sum & 1);
+  for (unsigned i = 0; i < N; i += 2) {
+    out->v[i] = (unsigned) out->v[i] * scale;
+  }
+}
+
+// poly_lift computes the function discussed in [HRSS], appendix B.
+static void poly_lift(struct poly *out, const struct poly *a) {
+  // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the
+  // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime).
+
+  // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up:
+  //
+  //   R.<x> = PolynomialRing(GF(3)…)
+  //   inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n))
+  //   list(inv)[:15]
+  //     [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2]
+  //
+  // This three-element pattern of coefficients repeats for the whole
+  // polynomial.
+  //
+  // Next define the overbar operator such that z̅ = z[0] +
+  // reverse(z[1:]). (Index zero of a polynomial here is the coefficient
+  // of the constant term. So index one is the coefficient of 𝑥 and so
+  // on.)
+  //
+  // A less odd way to define this is to see that z̅ negates the indexes,
+  // so z̅[0] = z[-0], z̅[1] = z[-1] and so on.
+  //
+  // The use of z̅ is that, when working mod (𝑥^701 - 1), vz̅[0] = <v, z>,
+  // vz̅[1] = <v, 𝑥z>, …. (Where <a, b> is the inner product: the sum
+  // of the point-wise products.) Although we calculated the inverse mod
+  // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end.
+  // (That's because (𝑥^N - 1) is a multiple of Φ(N).)
+  //
+  // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation
+  // of the list of coefficients.
+  //
+  // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like:
+  //
+  //   def reverse(xs):
+  //     suffix = list(xs[1:])
+  //     suffix.reverse()
+  //     return [xs[0]] + suffix
+  //
+  //   def rotate(xs):
+  //     return [xs[-1]] + xs[:-1]
+  //
+  //   zoverbar = reverse(list(inv) + [0])
+  //   xzoverbar = rotate(reverse(list(inv) + [0]))
+  //   x2zoverbar = rotate(rotate(reverse(list(inv) + [0])))
+  //
+  //   zoverbar[:15]
+  //     [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1]
+  //   xzoverbar[:15]
+  //     [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
+  //   x2zoverbar[:15]
+  //     [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
+  //
+  // (For a formula for z̅, see lemma two of appendix B.)
+  //
+  // After the first three elements have been taken care of, all then have
+  // a repeating three-element cycle. The next value (𝑥^3z̅) involves
+  // three rotations of the first pattern, thus the three-element cycle
+  // lines up. However, the discontinuity in the first three elements
+  // obviously moves to a different position. Consider the difference
+  // between 𝑥^3z̅ and z̅:
+  //
+  //   [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15]
+  //     [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+  //
+  // This pattern of differences is the same for all elements, although it
+  // obviously moves right with the rotations.
+  //
+  // From this, we reach algorithm eight of appendix B.
+
+  // Handle the first three elements of the inner products.
+  out->v[0] = a->v[0] + a->v[2];
+  out->v[1] = a->v[1];
+  out->v[2] = -a->v[0] + a->v[2];
+
+  // s0, s1, s2 are added into out->v[0], out->v[1], and out->v[2],
+  // respectively. We do not compute s1 because it's just -(s0 + s2).
+  uint16_t s0 = 0, s2 = 0;
+  for (size_t i = 3; i < 699; i += 3) {
+    s0 += -a->v[i] + a->v[i + 2];
+    // s1 += a->v[i] - a->v[i + 1];
+    s2 += a->v[i + 1] - a->v[i + 2];
+  }
+
+  // Handle the fact that the three-element pattern doesn't fill the
+  // polynomial exactly (since 701 isn't a multiple of three).
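+  // The loop above handles complete triples up to index 698, so the two
+  // leftover coefficients, 699 and 700, are folded in here.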
+ s0 -= a->v[699]; + // s1 += a->v[699] - a->v[700]; + s2 += a->v[700]; + + // Note that s0 + s1 + s2 = 0. + out->v[0] += s0; + out->v[1] -= (s0 + s2); // = s1 + out->v[2] += s2; + + // Calculate the remaining inner products by taking advantage of the + // fact that the pattern repeats every three cycles and the pattern of + // differences moves with the rotation. + for (size_t i = 3; i < N; i++) { + out->v[i] = (out->v[i - 3] - (a->v[i - 2] + a->v[i - 1] + a->v[i])); + } + + // Reduce mod Φ(N) by subtracting a multiple of out[700] from every + // element and convert to mod Q. (See above about adding twice as + // subtraction.) + const crypto_word_t v = out->v[700]; + for (unsigned i = 0; i < N; i++) { + const uint16_t vi_mod3 = mod3(out->v[i] - v); + // Map {0, 1, 2} to {0, 1, 0xffff}. + out->v[i] = (~((vi_mod3 >> 1) - 1)) | vi_mod3; + } + + poly_mul_x_minus_1(out); +} + +struct public_key { + struct poly ph; +}; + +struct private_key { + struct poly3 f, f_inverse; + struct poly ph_inverse; + uint8_t hmac_key[32]; +}; + +// public_key_from_external converts an external public key pointer into an +// internal one. Externally the alignment is only specified to be eight bytes +// but we need 16-byte alignment. We could annotate the external struct with +// that alignment but we can only assume that malloced pointers are 8-byte +// aligned in any case. (Even if the underlying malloc returns values with +// 16-byte alignment, |OPENSSL_malloc| will store an 8-byte size prefix and mess +// that up.) +static struct public_key *public_key_from_external( + struct HRSS_public_key *ext) { + OPENSSL_STATIC_ASSERT( + sizeof(struct HRSS_public_key) >= sizeof(struct public_key) + 15, + "HRSS public key too small"); + + uintptr_t p = (uintptr_t)ext; + p = (p + 15) & ~15; + return (struct public_key *)p; +} + +// private_key_from_external does the same thing as |public_key_from_external|, +// but for private keys. See the comment on that function about alignment +// issues. +static struct private_key *private_key_from_external( + struct HRSS_private_key *ext) { + OPENSSL_STATIC_ASSERT( + sizeof(struct HRSS_private_key) >= sizeof(struct private_key) + 15, + "HRSS private key too small"); + + uintptr_t p = (uintptr_t)ext; + p = (p + 15) & ~15; + return (struct private_key *)p; +} + +void HRSS_generate_key( + struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv, + const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32]) { + struct public_key *pub = public_key_from_external(out_pub); + struct private_key *priv = private_key_from_external(out_priv); + + OPENSSL_memcpy(priv->hmac_key, in + 2 * HRSS_SAMPLE_BYTES, + sizeof(priv->hmac_key)); + + struct poly f; + poly_short_sample_plus(&f, in); + poly3_from_poly(&priv->f, &f); + HRSS_poly3_invert(&priv->f_inverse, &priv->f); + + // pg_phi1 is p (i.e. 3) × g × Φ(1) (i.e. 𝑥-1). 
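+  //
+  // The public key will be ph = p·Φ(1)·g·f⁻¹ and the private key stores its
+  // inverse, f·(p·Φ(1)·g)⁻¹. Both come from the single inversion of
+  // f·p·Φ(1)·g below: multiplying that inverse by (p·Φ(1)·g)² gives ph, and
+  // multiplying it by f² gives ph⁻¹.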
+ struct poly pg_phi1; + poly_short_sample_plus(&pg_phi1, in + HRSS_SAMPLE_BYTES); + for (unsigned i = 0; i < N; i++) { + pg_phi1.v[i] *= 3; + } + poly_mul_x_minus_1(&pg_phi1); + + struct poly pfg_phi1; + poly_mul(&pfg_phi1, &f, &pg_phi1); + + struct poly pfg_phi1_inverse; + poly_invert(&pfg_phi1_inverse, &pfg_phi1); + + poly_mul(&pub->ph, &pfg_phi1_inverse, &pg_phi1); + poly_mul(&pub->ph, &pub->ph, &pg_phi1); + poly_clamp(&pub->ph); + + poly_mul(&priv->ph_inverse, &pfg_phi1_inverse, &f); + poly_mul(&priv->ph_inverse, &priv->ph_inverse, &f); + poly_clamp(&priv->ph_inverse); +} + +static void owf(uint8_t out[POLY_BYTES], const struct public_key *pub, + const struct poly *m_lifted, const struct poly *r) { + struct poly prh_plus_m; + poly_mul(&prh_plus_m, r, &pub->ph); + for (unsigned i = 0; i < N; i++) { + prh_plus_m.v[i] += m_lifted->v[i]; + } + + poly_marshal(out, &prh_plus_m); +} + +static const char kConfirmationHash[] = "confirmation hash"; +static const char kSharedKey[] = "shared key"; + +void HRSS_encap(uint8_t out_ciphertext[POLY_BYTES + 32], + uint8_t out_shared_key[32], + const struct HRSS_public_key *in_pub, + const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES]) { + const struct public_key *pub = + public_key_from_external((struct HRSS_public_key *)in_pub); + struct poly m, r, m_lifted; + poly_short_sample(&m, in); + poly_short_sample(&r, in + HRSS_SAMPLE_BYTES); + poly_lift(&m_lifted, &m); + owf(out_ciphertext, pub, &m_lifted, &r); + + uint8_t m_bytes[HRSS_POLY3_BYTES], r_bytes[HRSS_POLY3_BYTES]; + poly_marshal_mod3(m_bytes, &m); + poly_marshal_mod3(r_bytes, &r); + + SHA256_CTX hash_ctx; + SHA256_Init(&hash_ctx); + SHA256_Update(&hash_ctx, kConfirmationHash, sizeof(kConfirmationHash)); + SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes)); + SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes)); + SHA256_Final(out_ciphertext + POLY_BYTES, &hash_ctx); + + SHA256_Init(&hash_ctx); + SHA256_Update(&hash_ctx, kSharedKey, sizeof(kSharedKey)); + SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes)); + SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes)); + SHA256_Update(&hash_ctx, out_ciphertext, POLY_BYTES + 32); + SHA256_Final(out_shared_key, &hash_ctx); +} + +void HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES], + const struct HRSS_public_key *in_pub, + const struct HRSS_private_key *in_priv, + const uint8_t *ciphertext, size_t ciphertext_len) { + const struct public_key *pub = + public_key_from_external((struct HRSS_public_key *)in_pub); + const struct private_key *priv = + private_key_from_external((struct HRSS_private_key *)in_priv); + + // This is HMAC, expanded inline rather than using the |HMAC| function so that + // we can avoid dealing with possible allocation failures and so keep this + // function infallible. 
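+  //
+  // (Concretely, the following computes HMAC-SHA-256(hmac_key, ciphertext),
+  // i.e. SHA-256((key ^ opad) || SHA-256((key ^ ipad) || ciphertext)) with
+  // ipad = 0x36…36 and opad = 0x5c…5c. The result is used as the shared key
+  // whenever the ciphertext turns out to be invalid.)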
+ uint8_t masked_key[SHA256_CBLOCK]; + OPENSSL_STATIC_ASSERT(sizeof(priv->hmac_key) <= sizeof(masked_key), + "HRSS HMAC key larger than SHA-256 block size"); + for (size_t i = 0; i < sizeof(priv->hmac_key); i++) { + masked_key[i] = priv->hmac_key[i] ^ 0x36; + } + OPENSSL_memset(masked_key + sizeof(priv->hmac_key), 0x36, + sizeof(masked_key) - sizeof(priv->hmac_key)); + + SHA256_CTX hash_ctx; + SHA256_Init(&hash_ctx); + SHA256_Update(&hash_ctx, masked_key, sizeof(masked_key)); + SHA256_Update(&hash_ctx, ciphertext, ciphertext_len); + uint8_t inner_digest[SHA256_DIGEST_LENGTH]; + SHA256_Final(inner_digest, &hash_ctx); + + for (size_t i = 0; i < sizeof(priv->hmac_key); i++) { + masked_key[i] ^= (0x5c ^ 0x36); + } + OPENSSL_memset(masked_key + sizeof(priv->hmac_key), 0x5c, + sizeof(masked_key) - sizeof(priv->hmac_key)); + + SHA256_Init(&hash_ctx); + SHA256_Update(&hash_ctx, masked_key, sizeof(masked_key)); + SHA256_Update(&hash_ctx, inner_digest, sizeof(inner_digest)); + OPENSSL_STATIC_ASSERT(HRSS_KEY_BYTES == SHA256_DIGEST_LENGTH, + "HRSS shared key length incorrect"); + SHA256_Final(out_shared_key, &hash_ctx); + + // If the ciphertext is publicly invalid then a random shared key is still + // returned to simply the logic of the caller, but this path is not constant + // time. + if (ciphertext_len != POLY_BYTES + 32) { + return; + } + + struct poly c; + poly_unmarshal(&c, ciphertext); + + struct poly f; + poly_from_poly3(&f, &priv->f); + + struct poly cf; + poly_mul(&cf, &c, &f); + + struct poly3 cf3; + poly3_from_poly(&cf3, &cf); + // Note that cf3 is not reduced mod Φ(N). That reduction is deferred. + + struct poly3 m3; + HRSS_poly3_mul(&m3, &cf3, &priv->f_inverse); + + struct poly m, m_lifted; + poly_from_poly3(&m, &m3); + poly_lift(&m_lifted, &m); + + for (unsigned i = 0; i < N; i++) { + c.v[i] -= m_lifted.v[i]; + } + poly_mul(&c, &c, &priv->ph_inverse); + poly_mod_phiN(&c); + poly_clamp(&c); + + struct poly3 r3; + crypto_word_t ok = poly3_from_poly_checked(&r3, &c); + + uint8_t expected_ciphertext[POLY_BYTES + 32]; + assert(ciphertext_len == sizeof(expected_ciphertext)); + owf(expected_ciphertext, pub, &m_lifted, &c); + + uint8_t m_bytes[HRSS_POLY3_BYTES]; + uint8_t r_bytes[HRSS_POLY3_BYTES]; + poly_marshal_mod3(m_bytes, &m); + poly_marshal_mod3(r_bytes, &c); + + SHA256_Init(&hash_ctx); + SHA256_Update(&hash_ctx, kConfirmationHash, sizeof(kConfirmationHash)); + SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes)); + SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes)); + SHA256_Final(expected_ciphertext + POLY_BYTES, &hash_ctx); + + ok &= constant_time_is_zero_w(CRYPTO_memcmp(ciphertext, expected_ciphertext, + sizeof(expected_ciphertext))); + + uint8_t shared_key[32]; + SHA256_Init(&hash_ctx); + SHA256_Update(&hash_ctx, kSharedKey, sizeof(kSharedKey)); + SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes)); + SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes)); + SHA256_Update(&hash_ctx, expected_ciphertext, sizeof(expected_ciphertext)); + SHA256_Final(shared_key, &hash_ctx); + + for (unsigned i = 0; i < sizeof(shared_key); i++) { + out_shared_key[i] = + constant_time_select_8(ok, shared_key[i], out_shared_key[i]); + } +} + +void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES], + const struct HRSS_public_key *in_pub) { + const struct public_key *pub = + public_key_from_external((struct HRSS_public_key *)in_pub); + poly_marshal(out, &pub->ph); +} + +int HRSS_parse_public_key(struct HRSS_public_key *out, + const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) { + struct public_key *pub = 
public_key_from_external(out);
+  poly_unmarshal(&pub->ph, in);
+  OPENSSL_memset(&pub->ph.v[N], 0, 3 * sizeof(uint16_t));
+  return 1;
+}
diff --git a/crypto/hrss/hrss_test.cc b/crypto/hrss/hrss_test.cc
new file mode 100644
index 00000000..d23e68e5
--- /dev/null
+++ b/crypto/hrss/hrss_test.cc
@@ -0,0 +1,475 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/hrss.h>
+
+#include <gtest/gtest.h>
+#include <openssl/rand.h>
+
+#include "../test/test_util.h"
+#include "internal.h"
+
+// poly2_from_bits takes the least-significant bit from each byte of |in| and
+// sets the bits of |*out| to match.
+static void poly2_from_bits(struct poly2 *out, const uint8_t in[N]) {
+  crypto_word_t *words = out->v;
+  unsigned shift = 0;
+  crypto_word_t word = 0;
+
+  for (unsigned i = 0; i < N; i++) {
+    word >>= 1;
+    word |= (crypto_word_t)(in[i] & 1) << (BITS_PER_WORD - 1);
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      *words = word;
+      words++;
+      word = 0;
+      shift = 0;
+    }
+  }
+
+  word >>= BITS_PER_WORD - shift;
+  *words = word;
+}
+
+TEST(HRSS, Poly2RotateRight) {
+  uint8_t bits[N];
+  RAND_bytes(bits, sizeof(bits));
+  for (size_t i = 0; i < N; i++) {
+    bits[i] &= 1;
+  }
+
+  struct poly2 p, orig, shifted;
+  poly2_from_bits(&p, bits);
+  OPENSSL_memcpy(&orig, &p, sizeof(orig));
+
+  // Test |HRSS_poly2_rotr_consttime| by manually rotating |bits| step-by-step
+  // and testing every possible shift to ensure that it produces the correct
+  // answer.
+  for (size_t shift = 0; shift <= N; shift++) {
+    SCOPED_TRACE(shift);
+
+    OPENSSL_memcpy(&p, &orig, sizeof(orig));
+    HRSS_poly2_rotr_consttime(&p, shift);
+    poly2_from_bits(&shifted, bits);
+    ASSERT_EQ(
+        Bytes(reinterpret_cast<const uint8_t *>(&shifted), sizeof(shifted)),
+        Bytes(reinterpret_cast<const uint8_t *>(&p), sizeof(p)));
+
+    const uint8_t least_significant_bit = bits[0];
+    OPENSSL_memmove(bits, &bits[1], N-1);
+    bits[N-1] = least_significant_bit;
+  }
+}
+
+// poly3_rand sets |p| to a random value (albeit with bias).
+static void poly3_rand(poly3 *p) {
+  RAND_bytes(reinterpret_cast<uint8_t *>(p), sizeof(poly3));
+  p->s.v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+  p->a.v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+  // (s, a) = (1, 1) is invalid. Map those to one.
+  for (size_t j = 0; j < WORDS_PER_POLY; j++) {
+    p->s.v[j] ^= p->s.v[j] & p->a.v[j];
+  }
+}
+
+// poly3_word_add sets (|s1|, |a1|) += (|s2|, |a2|).
+static void poly3_word_add(crypto_word_t *s1, crypto_word_t *a1,
+                           const crypto_word_t s2, const crypto_word_t a2) {
+  const crypto_word_t x = *a1 ^ a2;
+  const crypto_word_t y = (*s1 ^ s2) ^ (*a1 & a2);
+  const crypto_word_t z = *s1 & s2;
+  *s1 = y & ~x;
+  *a1 = z | (x & ~y);
+}
+
+TEST(HRSS, Poly3Invert) {
+  poly3 p, inverse, result;
+  memset(&p, 0, sizeof(p));
+  memset(&inverse, 0, sizeof(inverse));
+  memset(&result, 0, sizeof(result));
+
+  // The inverse of -1 is -1.
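+  // (In the (s, a) bit-pair encoding used by struct poly3, zero is (0, 0), one
+  // is (0, 1) and minus one is (1, 0), so setting the low bit of p.s makes |p|
+  // the constant polynomial -1.)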
+  p.s.v[0] = 1;
+  HRSS_poly3_invert(&inverse, &p);
+  EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t *>(&p), sizeof(p)),
+            Bytes(reinterpret_cast<const uint8_t *>(&inverse), sizeof(inverse)));
+
+  // The inverse of 1 is 1.
+  p.s.v[0] = 0;
+  p.a.v[0] = 1;
+  HRSS_poly3_invert(&inverse, &p);
+  EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t *>(&p), sizeof(p)),
+            Bytes(reinterpret_cast<const uint8_t *>(&inverse), sizeof(inverse)));
+
+  for (size_t i = 0; i < 500; i++) {
+    poly3 r;
+    poly3_rand(&r);
+    HRSS_poly3_invert(&inverse, &r);
+    HRSS_poly3_mul(&result, &inverse, &r);
+    // r×r⁻¹ = 1, and |p| contains 1.
+    EXPECT_EQ(
+        Bytes(reinterpret_cast<const uint8_t *>(&p), sizeof(p)),
+        Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result)));
+  }
+}
+
+TEST(HRSS, Poly3UnreducedInput) {
+  // Check that |poly3_mul| works correctly with inputs that aren't reduced mod
+  // Φ(N).
+  poly3 r, inverse, result, one;
+  poly3_rand(&r);
+  HRSS_poly3_invert(&inverse, &r);
+  HRSS_poly3_mul(&result, &inverse, &r);
+
+  memset(&one, 0, sizeof(one));
+  one.a.v[0] = 1;
+  EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t *>(&one), sizeof(one)),
+            Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result)));
+
+  // |r| is probably already not reduced mod Φ(N), but add x^701 - 1 and
+  // recompute to ensure that we get the same answer. (Since (x^701 - 1) ≡ 0 mod
+  // Φ(N).)
+  poly3_word_add(&r.s.v[0], &r.a.v[0], 1, 0);
+  poly3_word_add(&r.s.v[WORDS_PER_POLY - 1], &r.a.v[WORDS_PER_POLY - 1], 0,
+                 UINT64_C(1) << BITS_IN_LAST_WORD);
+
+  HRSS_poly3_mul(&result, &inverse, &r);
+  EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t *>(&one), sizeof(one)),
+            Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result)));
+
+  // Check that x^700 × 1 gives -x^699 - x^698 … -1.
+  poly3 x700;
+  memset(&x700, 0, sizeof(x700));
+  x700.a.v[WORDS_PER_POLY-1] = UINT64_C(1) << (BITS_IN_LAST_WORD - 1);
+  HRSS_poly3_mul(&result, &one, &x700);
+
+  for (size_t i = 0; i < WORDS_PER_POLY-1; i++) {
+    EXPECT_EQ(CONSTTIME_TRUE_W, result.s.v[i]);
+    EXPECT_EQ(0u, result.a.v[i]);
+  }
+  EXPECT_EQ((UINT64_C(1) << (BITS_IN_LAST_WORD - 1)) - 1,
+            result.s.v[WORDS_PER_POLY - 1]);
+  EXPECT_EQ(0u, result.a.v[WORDS_PER_POLY - 1]);
+}
+
+TEST(HRSS, Basic) {
+  uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES];
+  for (unsigned i = 0; i < sizeof(generate_key_entropy); i++) {
+    generate_key_entropy[i] = i;
+  }
+
+  HRSS_public_key pub;
+  HRSS_private_key priv;
+  HRSS_generate_key(&pub, &priv, generate_key_entropy);
+
+  uint8_t encap_entropy[HRSS_ENCAP_BYTES];
+  for (unsigned i = 0; i < sizeof(encap_entropy); i++) {
+    encap_entropy[i] = i;
+  }
+
+  uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+  uint8_t shared_key[HRSS_KEY_BYTES];
+  HRSS_encap(ciphertext, shared_key, &pub, encap_entropy);
+
+  HRSS_public_key pub2;
+  uint8_t pub_bytes[HRSS_PUBLIC_KEY_BYTES];
+  HRSS_marshal_public_key(pub_bytes, &pub);
+  ASSERT_TRUE(HRSS_parse_public_key(&pub2, pub_bytes));
+
+  uint8_t shared_key2[HRSS_KEY_BYTES];
+  HRSS_decap(shared_key2, &pub2, &priv, ciphertext, sizeof(ciphertext));
+
+  EXPECT_EQ(Bytes(shared_key), Bytes(shared_key2));
+}
+
+TEST(HRSS, Random) {
+  for (unsigned i = 0; i < 10; i++) {
+    uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES];
+    RAND_bytes(generate_key_entropy, sizeof(generate_key_entropy));
+    SCOPED_TRACE(Bytes(generate_key_entropy));
+
+    HRSS_public_key pub;
+    HRSS_private_key priv;
+    HRSS_generate_key(&pub, &priv, generate_key_entropy);
+
+    for (unsigned j = 0; j < 10; j++) {
+      uint8_t encap_entropy[HRSS_ENCAP_BYTES];
+      RAND_bytes(encap_entropy, sizeof(encap_entropy));
+      SCOPED_TRACE(Bytes(generate_key_entropy));
+
+      uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+      uint8_t shared_key[HRSS_KEY_BYTES];
+
HRSS_encap(ciphertext, shared_key, &pub, encap_entropy); + + uint8_t shared_key2[HRSS_KEY_BYTES]; + HRSS_decap(shared_key2, &pub, &priv, ciphertext, sizeof(ciphertext)); + + EXPECT_EQ(Bytes(shared_key), Bytes(shared_key2)); + } + } +} + +TEST(HRSS, Golden) { + uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES]; + for (unsigned i = 0; i < HRSS_SAMPLE_BYTES; i++) { + generate_key_entropy[i] = i; + } + for (unsigned i = HRSS_SAMPLE_BYTES; i < 2 * HRSS_SAMPLE_BYTES; i++) { + generate_key_entropy[i] = 2 + i; + } + for (unsigned i = 2 * HRSS_SAMPLE_BYTES; i < sizeof(generate_key_entropy); + i++) { + generate_key_entropy[i] = 4 + i; + } + + HRSS_public_key pub; + HRSS_private_key priv; + OPENSSL_memset(&pub, 0, sizeof(pub)); + OPENSSL_memset(&priv, 0, sizeof(priv)); + HRSS_generate_key(&pub, &priv, generate_key_entropy); + + static const uint8_t kExpectedPub[HRSS_PUBLIC_KEY_BYTES] = { + 0xf8, 0x9f, 0xa0, 0xfc, 0xf1, 0xd4, 0xfa, 0x4d, 0x8f, 0x35, 0x28, 0x73, + 0x0e, 0x37, 0x18, 0x1d, 0x09, 0xf3, 0x9e, 0x16, 0x0d, 0x7f, 0x9c, 0x82, + 0x17, 0xa1, 0xa1, 0x88, 0x6b, 0x29, 0x5b, 0x3a, 0x30, 0xcd, 0x6f, 0x8e, + 0x0c, 0xd3, 0x38, 0x0c, 0x05, 0x68, 0x6e, 0x4c, 0xcc, 0x20, 0xd4, 0x06, + 0x77, 0x0c, 0xac, 0x1c, 0x49, 0x14, 0x00, 0xd6, 0x9b, 0x1c, 0xde, 0x43, + 0x0a, 0x59, 0x37, 0xd6, 0x46, 0x68, 0x1f, 0x04, 0xcb, 0x73, 0x92, 0x37, + 0x2d, 0x7f, 0x57, 0x70, 0x16, 0xe8, 0x06, 0x48, 0x3b, 0x66, 0xb3, 0x63, + 0x02, 0x5a, 0x71, 0x46, 0xdd, 0xa4, 0xee, 0xb8, 0x78, 0x44, 0xfd, 0x9e, + 0xd0, 0x71, 0x16, 0x00, 0xbd, 0x01, 0x1e, 0x27, 0x2e, 0xa0, 0xc6, 0x8d, + 0x55, 0x89, 0x7c, 0x2a, 0x01, 0x2b, 0x1b, 0x75, 0xa2, 0xc2, 0xd1, 0x5a, + 0x67, 0xfa, 0xdd, 0x3b, 0x70, 0x9d, 0xdb, 0xcd, 0x73, 0x32, 0x5e, 0x24, + 0xb1, 0xcf, 0x23, 0xbe, 0x3c, 0x56, 0xcc, 0xbe, 0x61, 0xdb, 0xe7, 0x3c, + 0xc7, 0xf5, 0x09, 0xe6, 0x87, 0xa0, 0x09, 0x52, 0x9d, 0x61, 0x5b, 0xc6, + 0xd4, 0xc5, 0x2e, 0xc2, 0x6c, 0x87, 0x30, 0x36, 0x49, 0x6f, 0x04, 0xaa, + 0xb3, 0x26, 0xd5, 0x63, 0xcf, 0xd4, 0x74, 0x1e, 0xc7, 0x79, 0xb3, 0xfc, + 0x8c, 0x41, 0x36, 0x79, 0xaa, 0xd5, 0xba, 0x64, 0x49, 0x48, 0xdb, 0xeb, + 0xe8, 0x33, 0x7d, 0xbe, 0x3b, 0x67, 0xd7, 0xfd, 0x93, 0x1e, 0x80, 0x8d, + 0x17, 0xab, 0x6f, 0xfd, 0x1c, 0x4b, 0x2d, 0x5b, 0x90, 0xf0, 0xf0, 0x5d, + 0xbe, 0x8f, 0x81, 0x18, 0x29, 0x08, 0x9a, 0x47, 0x1b, 0xc2, 0x2d, 0xa2, + 0x22, 0x5a, 0x4f, 0xe9, 0x81, 0x64, 0xdd, 0x53, 0x2e, 0x67, 0xe5, 0x07, + 0x1a, 0xf0, 0x0c, 0x54, 0x9b, 0xe2, 0xf8, 0xe6, 0xb3, 0xb6, 0xe0, 0x5a, + 0x74, 0xfa, 0x8d, 0x9c, 0xa5, 0x7c, 0x6e, 0x73, 0xba, 0xee, 0x6e, 0x6e, + 0x31, 0xcb, 0x59, 0xd7, 0xfd, 0x94, 0x1c, 0x4d, 0x62, 0xc6, 0x87, 0x0b, + 0x38, 0x54, 0xc6, 0x35, 0xac, 0xc8, 0x8c, 0xc0, 0xd9, 0x99, 0xee, 0xfc, + 0xa9, 0xde, 0xc4, 0x50, 0x88, 0x8e, 0x24, 0xf6, 0xd6, 0x04, 0x54, 0x3e, + 0x81, 0xc4, 0x96, 0x9a, 0x40, 0xe5, 0xef, 0x8b, 0xec, 0x41, 0x50, 0x1d, + 0x14, 0xae, 0xa4, 0x5a, 0xac, 0xd4, 0x73, 0x31, 0xc3, 0x1d, 0xc1, 0x96, + 0x89, 0xd8, 0x62, 0x97, 0x60, 0x3f, 0x58, 0x2a, 0x5f, 0xcf, 0xcb, 0x26, + 0x99, 0x69, 0x81, 0x13, 0x9c, 0xaf, 0x17, 0x91, 0xa8, 0xeb, 0x9a, 0xf9, + 0xd3, 0x83, 0x47, 0x66, 0xc7, 0xf8, 0xd8, 0xe3, 0xd2, 0x7e, 0x58, 0xa9, + 0xf5, 0xb2, 0x03, 0xbe, 0x7e, 0xa5, 0x29, 0x9d, 0xff, 0xd1, 0xd8, 0x55, + 0x39, 0xc7, 0x2c, 0xce, 0x03, 0x64, 0xdc, 0x18, 0xe7, 0xb0, 0x60, 0x46, + 0x26, 0xeb, 0xb7, 0x61, 0x4b, 0x91, 0x2c, 0xd8, 0xa2, 0xee, 0x63, 0x2e, + 0x15, 0x0a, 0x58, 0x88, 0x04, 0xb1, 0xed, 0x6d, 0xf1, 0x5c, 0xc7, 0xee, + 0x60, 0x38, 0x26, 0xc9, 0x31, 0x7e, 0x69, 0xe4, 0xac, 0x3c, 0x72, 0x09, + 0x3e, 0xe6, 0x24, 0x30, 0x44, 0x6e, 0x66, 0x83, 0xb9, 0x2a, 0x22, 
0xaf, + 0x26, 0x1e, 0xaa, 0xa3, 0xf4, 0xb1, 0xa1, 0x5c, 0xfa, 0x5f, 0x0d, 0x71, + 0xac, 0xe3, 0xe0, 0xc3, 0xdd, 0x4f, 0x96, 0x57, 0x8b, 0x58, 0xac, 0xe3, + 0x42, 0x8e, 0x47, 0x72, 0xb1, 0xe4, 0x19, 0x68, 0x3e, 0xbb, 0x19, 0x14, + 0xdf, 0x16, 0xb5, 0xde, 0x7f, 0x37, 0xaf, 0xd8, 0xd3, 0x3d, 0x6a, 0x16, + 0x1b, 0x26, 0xd3, 0xcc, 0x53, 0x82, 0x57, 0x90, 0x89, 0xc5, 0x7e, 0x6d, + 0x7e, 0x99, 0x5b, 0xcd, 0xd3, 0x18, 0xbb, 0x89, 0xef, 0x76, 0xbd, 0xd2, + 0x62, 0xf0, 0xe8, 0x25, 0x2a, 0x8d, 0xe2, 0x21, 0xea, 0xde, 0x6e, 0xa5, + 0xa4, 0x3d, 0x58, 0xee, 0xdf, 0x90, 0xc1, 0xa1, 0x38, 0x5d, 0x11, 0x50, + 0xb5, 0xac, 0x9d, 0xb4, 0xfd, 0xef, 0x53, 0xe8, 0xc0, 0x17, 0x6c, 0x4f, + 0x31, 0xe0, 0xcc, 0x8f, 0x80, 0x7a, 0x84, 0x14, 0xde, 0xee, 0xec, 0xdd, + 0x6a, 0xad, 0x29, 0x65, 0xa5, 0x72, 0xc3, 0x73, 0x5f, 0xe3, 0x6f, 0x60, + 0xb1, 0xfb, 0x0f, 0xaa, 0xc6, 0xda, 0x53, 0x4a, 0xb1, 0x92, 0x2a, 0xb7, + 0x02, 0xbe, 0xf9, 0xdf, 0x37, 0x16, 0xe7, 0x5c, 0x38, 0x0b, 0x3c, 0xe2, + 0xdd, 0x90, 0xb8, 0x7b, 0x48, 0x69, 0x79, 0x81, 0xc5, 0xae, 0x9a, 0x0d, + 0x78, 0x95, 0x52, 0x63, 0x80, 0xda, 0x46, 0x69, 0x20, 0x57, 0x9b, 0x27, + 0xe2, 0xe8, 0xbd, 0x2f, 0x45, 0xe6, 0x46, 0x40, 0xae, 0x50, 0xd5, 0xa2, + 0x53, 0x93, 0xe1, 0x99, 0xfd, 0x13, 0x7c, 0xf6, 0x22, 0xc4, 0x6c, 0xab, + 0xe3, 0xc9, 0x55, 0x0a, 0x16, 0x67, 0x68, 0x26, 0x6b, 0xd6, 0x7d, 0xde, + 0xd3, 0xae, 0x71, 0x32, 0x02, 0xf1, 0x27, 0x67, 0x47, 0x74, 0xd9, 0x40, + 0x35, 0x1d, 0x25, 0x72, 0x32, 0xdf, 0x75, 0xd5, 0x60, 0x26, 0xab, 0x90, + 0xfa, 0xeb, 0x26, 0x11, 0x4b, 0xb4, 0xc5, 0xc2, 0x3e, 0xa9, 0x23, 0x3a, + 0x4e, 0x6a, 0xb1, 0xbb, 0xb3, 0xea, 0xf9, 0x1e, 0xe4, 0x10, 0xf5, 0xdc, + 0x35, 0xde, 0xb5, 0xee, 0xf0, 0xde, 0xa1, 0x18, 0x80, 0xc7, 0x13, 0x68, + 0x46, 0x94, 0x0e, 0x2a, 0x8e, 0xf8, 0xe9, 0x26, 0x84, 0x42, 0x0f, 0x56, + 0xed, 0x67, 0x7f, 0xeb, 0x7d, 0x35, 0x07, 0x01, 0x11, 0x81, 0x8b, 0x56, + 0x88, 0xc6, 0x58, 0x61, 0x65, 0x3c, 0x5d, 0x9c, 0x58, 0x25, 0xd6, 0xdf, + 0x4e, 0x3b, 0x93, 0xbf, 0x82, 0xe1, 0x19, 0xb8, 0xda, 0xde, 0x26, 0x38, + 0xf2, 0xd9, 0x95, 0x24, 0x98, 0xde, 0x58, 0xf7, 0x0c, 0xe9, 0x32, 0xbb, + 0xcc, 0xf7, 0x92, 0x69, 0xa2, 0xf0, 0xc3, 0xfa, 0xd2, 0x31, 0x8b, 0x43, + 0x4e, 0x03, 0xe2, 0x13, 0x79, 0x6e, 0x73, 0x63, 0x3b, 0x45, 0xde, 0x80, + 0xf4, 0x26, 0xb1, 0x38, 0xed, 0x62, 0x55, 0xc6, 0x6a, 0x67, 0x00, 0x2d, + 0xba, 0xb2, 0xc5, 0xb6, 0x97, 0x62, 0x28, 0x64, 0x30, 0xb9, 0xfb, 0x3f, + 0x94, 0x03, 0x48, 0x36, 0x2c, 0x5d, 0xfd, 0x08, 0x96, 0x40, 0xd1, 0x6c, + 0xe5, 0xd0, 0xf8, 0x99, 0x40, 0x82, 0x87, 0xd7, 0xdc, 0x2f, 0x8b, 0xaa, + 0x31, 0x96, 0x0a, 0x34, 0x33, 0xa6, 0xf1, 0x84, 0x6e, 0x33, 0x73, 0xc5, + 0xe3, 0x26, 0xad, 0xd0, 0xcb, 0x62, 0x71, 0x82, 0xab, 0xd1, 0x82, 0x33, + 0xe6, 0xca, 0xd0, 0x3e, 0xf5, 0x4d, 0x12, 0x6e, 0xf1, 0x83, 0xbd, 0xdc, + 0x4d, 0xdf, 0x49, 0xbc, 0x63, 0xae, 0x7e, 0x59, 0xe8, 0x3c, 0x0d, 0xd6, + 0x1d, 0x41, 0x89, 0x72, 0x52, 0xc0, 0xae, 0xd1, 0x2f, 0x0a, 0x8a, 0xce, + 0x26, 0xd0, 0x3e, 0x0c, 0x71, 0x32, 0x52, 0xb2, 0xe4, 0xee, 0xa2, 0xe5, + 0x28, 0xb6, 0x33, 0x69, 0x97, 0x5a, 0x53, 0xdb, 0x56, 0x63, 0xe9, 0xb3, + 0x6d, 0x60, 0xf4, 0x7a, 0xce, 0xec, 0x36, 0x65, 0xd5, 0xca, 0x63, 0x2a, + 0x19, 0x90, 0x14, 0x7b, 0x02, 0x33, 0xfa, 0x11, 0x58, 0x5a, 0xd9, 0xc5, + 0x54, 0xf3, 0x28, 0xd5, 0x6e, 0xea, 0x85, 0xf5, 0x09, 0xbb, 0x81, 0x44, + 0x1c, 0x63, 0x66, 0x81, 0xc5, 0x96, 0x2d, 0x7c, 0x0e, 0x75, 0x7b, 0xb4, + 0x7e, 0x4e, 0x0c, 0xfd, 0x3c, 0xc5, 0x5a, 0x22, 0x85, 0x5c, 0xc8, 0xf3, + 0x97, 0x98, 0x2c, 0xe9, 0x46, 0xb4, 0x02, 0xcf, 0x7d, 0xa4, 0xf2, 0x44, + 0x7a, 0x89, 0x71, 0xa0, 0xfa, 0xb6, 0xa3, 0xaf, 0x13, 0x25, 0x46, 
0xe2, + 0x64, 0xe3, 0x69, 0xba, 0xf9, 0x68, 0x5c, 0xc0, 0xb7, 0xa8, 0xa6, 0x4b, + 0xe1, 0x42, 0xe9, 0xb5, 0xc7, 0x84, 0xbb, 0xa6, 0x4b, 0x10, 0x4e, 0xd4, + 0x68, 0x70, 0x0a, 0x75, 0x2a, 0xbb, 0x9d, 0xa0, 0xcb, 0xf0, 0x36, 0x4c, + 0x70, 0x6c, 0x60, 0x4d, 0xfe, 0xe8, 0xc8, 0x66, 0x80, 0x1b, 0xf7, 0xcc, + 0x1a, 0xdd, 0x6b, 0xa7, 0xa7, 0x25, 0x61, 0x0c, 0x31, 0xf0, 0x34, 0x63, + 0x00, 0x0e, 0x48, 0x6a, 0x5a, 0x8d, 0x47, 0x94, 0x3f, 0x14, 0x16, 0xa8, + 0x8a, 0x49, 0xbb, 0x0c, 0x43, 0x21, 0xda, 0xf2, 0xc5, 0xd0, 0xff, 0x19, + 0x3e, 0x36, 0x64, 0x20, 0xb3, 0x70, 0xae, 0x54, 0xca, 0x73, 0x05, 0x56, + 0x7a, 0x49, 0x45, 0xe9, 0x46, 0xbc, 0xc2, 0x61, 0x70, 0x40, 0x7c, 0xb0, + 0xf7, 0xea, 0xc0, 0xd1, 0xb0, 0x77, 0x2c, 0xc7, 0xdd, 0x88, 0xcb, 0x9d, + 0xea, 0x55, 0x6c, 0x5c, 0x28, 0xb8, 0x84, 0x1c, 0x2c, 0x06, + }; + uint8_t pub_bytes[HRSS_PUBLIC_KEY_BYTES]; + HRSS_marshal_public_key(pub_bytes, &pub); + EXPECT_EQ(Bytes(pub_bytes), Bytes(kExpectedPub)); + + uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES]; + uint8_t shared_key[HRSS_KEY_BYTES]; + OPENSSL_STATIC_ASSERT( + sizeof(kExpectedPub) >= HRSS_ENCAP_BYTES, + "Private key too small to use as input to HRSS encapsulation"); + HRSS_encap(ciphertext, shared_key, &pub, kExpectedPub); + + static const uint8_t kExpectedCiphertext[HRSS_CIPHERTEXT_BYTES] = { + 0x8e, 0x6b, 0x46, 0x9d, 0x4a, 0xef, 0xa6, 0x8c, 0x28, 0x7b, 0xec, 0x6f, + 0x13, 0x2d, 0x7f, 0x6c, 0xca, 0x7d, 0x9e, 0x6b, 0x54, 0x62, 0xa3, 0x13, + 0xe1, 0x1e, 0x8f, 0x5f, 0x71, 0x67, 0xc4, 0x85, 0xdf, 0xd5, 0x6b, 0xbd, + 0x86, 0x0f, 0x98, 0xec, 0xa5, 0x04, 0xf7, 0x7b, 0x2a, 0xbe, 0xcb, 0xac, + 0x29, 0xbe, 0xe1, 0x0f, 0xbc, 0x62, 0x87, 0x85, 0x7f, 0x05, 0xae, 0xe4, + 0x3f, 0x87, 0xfc, 0x1f, 0xf7, 0x45, 0x1e, 0xa3, 0xdb, 0xb1, 0xa0, 0x25, + 0xba, 0x82, 0xec, 0xca, 0x8d, 0xab, 0x7a, 0x20, 0x03, 0xeb, 0xe5, 0x5c, + 0x9f, 0xd0, 0x46, 0x78, 0xf1, 0x5a, 0xc7, 0x9e, 0xb4, 0x10, 0x6d, 0x37, + 0xc0, 0x75, 0x08, 0xfb, 0xeb, 0xcb, 0xd8, 0x35, 0x21, 0x9b, 0x89, 0xa0, + 0xaa, 0x87, 0x00, 0x66, 0x38, 0x37, 0x68, 0xa4, 0xa3, 0x93, 0x8e, 0x2b, + 0xca, 0xf7, 0x7a, 0x43, 0xb2, 0x15, 0x79, 0x81, 0xce, 0xa9, 0x09, 0xcb, + 0x29, 0xd4, 0xcc, 0xef, 0xf1, 0x9b, 0xbd, 0xe6, 0x63, 0xd5, 0x26, 0x0f, + 0xe8, 0x8b, 0xdf, 0xf1, 0xc3, 0xb4, 0x18, 0x0e, 0xf2, 0x1d, 0x5d, 0x82, + 0x9b, 0x1f, 0xf3, 0xca, 0x36, 0x2a, 0x26, 0x0a, 0x7f, 0xc4, 0x0d, 0xbd, + 0x5b, 0x15, 0x1c, 0x18, 0x6c, 0x11, 0x4e, 0xec, 0x36, 0x01, 0xc1, 0x15, + 0xab, 0xf7, 0x0b, 0x1a, 0xd3, 0xa1, 0xbd, 0x68, 0xc8, 0x59, 0xe7, 0x49, + 0x5c, 0xd5, 0x4b, 0x8c, 0x31, 0xdb, 0xb3, 0xea, 0x88, 0x09, 0x2f, 0xb9, + 0x8b, 0xfd, 0x96, 0x35, 0x88, 0x53, 0x72, 0x40, 0xcd, 0x89, 0x75, 0xb4, + 0x20, 0xf6, 0xf6, 0xe5, 0x74, 0x19, 0x48, 0xaf, 0x4b, 0xaa, 0x42, 0xa4, + 0xc8, 0x90, 0xee, 0xf3, 0x12, 0x04, 0x63, 0x90, 0x92, 0x8a, 0x89, 0xc3, + 0xa0, 0x7e, 0xfe, 0x19, 0xb3, 0x54, 0x53, 0x83, 0xe9, 0xc1, 0x6c, 0xe3, + 0x97, 0xa6, 0x27, 0xc3, 0x20, 0x9a, 0x79, 0x35, 0xc9, 0xb5, 0xc0, 0x90, + 0xe1, 0x56, 0x84, 0x69, 0xc2, 0x54, 0x77, 0x52, 0x48, 0x55, 0x71, 0x3e, + 0xcd, 0xa7, 0xd6, 0x25, 0x5d, 0x49, 0x13, 0xd2, 0x59, 0xd7, 0xe1, 0xd1, + 0x70, 0x46, 0xa0, 0xd4, 0xee, 0x59, 0x13, 0x1f, 0x1a, 0xd3, 0x39, 0x7d, + 0xb0, 0x79, 0xf7, 0xc0, 0x73, 0x5e, 0xbb, 0x08, 0xf7, 0x5c, 0xb0, 0x31, + 0x41, 0x3d, 0x7b, 0x1e, 0xf0, 0xe6, 0x47, 0x5c, 0x37, 0xd5, 0x54, 0xf1, + 0xbb, 0x64, 0xd7, 0x41, 0x8b, 0x34, 0x55, 0xaa, 0xc3, 0x5a, 0x9c, 0xa0, + 0xcc, 0x29, 0x8e, 0x5a, 0x1a, 0x93, 0x5a, 0x49, 0xd3, 0xd0, 0xa0, 0x56, + 0xda, 0x32, 0xa2, 0xa9, 0xa7, 0x13, 0x42, 0x93, 0x9b, 0x20, 0x32, 0x37, + 0x5c, 0x3e, 0x03, 0xa5, 0x28, 
0x10, 0x93, 0xdd, 0xa0, 0x04, 0x7b, 0x2a, + 0xbd, 0x31, 0xc3, 0x6a, 0x89, 0x58, 0x6e, 0x55, 0x0e, 0xc9, 0x5c, 0x70, + 0x07, 0x10, 0xf1, 0x9a, 0xbd, 0xfb, 0xd2, 0xb7, 0x94, 0x5b, 0x4f, 0x8d, + 0x90, 0xfa, 0xee, 0xae, 0x37, 0x48, 0xc5, 0xf8, 0x16, 0xa1, 0x3b, 0x70, + 0x03, 0x1f, 0x0e, 0xb8, 0xbd, 0x8d, 0x30, 0x4f, 0x95, 0x31, 0x0b, 0x9f, + 0xfc, 0x80, 0xf8, 0xef, 0xa3, 0x3c, 0xbc, 0xe2, 0x23, 0x23, 0x3e, 0x2a, + 0x55, 0x11, 0xe8, 0x2c, 0x17, 0xea, 0x1c, 0xbd, 0x1d, 0x2d, 0x1b, 0xd5, + 0x16, 0x9e, 0x05, 0xfc, 0x89, 0x64, 0x50, 0x4d, 0x9a, 0x22, 0x50, 0xc6, + 0x5a, 0xd9, 0x58, 0x99, 0x8f, 0xbd, 0xf2, 0x4f, 0x2c, 0xdb, 0x51, 0x6a, + 0x86, 0xe2, 0xc6, 0x64, 0x8f, 0x54, 0x1a, 0xf2, 0xcb, 0x34, 0x88, 0x08, + 0xbd, 0x2a, 0x8f, 0xec, 0x29, 0xf5, 0x22, 0x36, 0x83, 0x99, 0xb9, 0x71, + 0x8c, 0x99, 0x5c, 0xec, 0x91, 0x78, 0xc1, 0xe2, 0x2d, 0xe9, 0xd1, 0x4d, + 0xf5, 0x15, 0x93, 0x4d, 0x93, 0x92, 0x9f, 0x0f, 0x33, 0x5e, 0xcd, 0x58, + 0x5f, 0x3d, 0x52, 0xb9, 0x38, 0x6a, 0x85, 0x63, 0x8b, 0x63, 0x29, 0xcb, + 0x67, 0x12, 0x25, 0xc2, 0x44, 0xd7, 0xab, 0x1a, 0x24, 0xca, 0x3d, 0xca, + 0x77, 0xce, 0x28, 0x68, 0x1a, 0x91, 0xed, 0x7b, 0xc9, 0x70, 0x84, 0xab, + 0xe2, 0xd4, 0xf4, 0xac, 0x58, 0xf6, 0x70, 0x99, 0xfc, 0x99, 0x4d, 0xbd, + 0xb4, 0x1b, 0x4f, 0x15, 0x86, 0x95, 0x08, 0xd1, 0x4e, 0x73, 0xa9, 0xbc, + 0x6a, 0x8c, 0xbc, 0xb5, 0x4b, 0xe0, 0xee, 0x35, 0x24, 0xf9, 0x12, 0xf5, + 0x88, 0x70, 0x50, 0x6c, 0xfe, 0x0d, 0x35, 0xbd, 0xf7, 0xc4, 0x2e, 0x39, + 0x16, 0x30, 0x6c, 0xf3, 0xb2, 0x19, 0x44, 0xaa, 0xcb, 0x4a, 0xf6, 0x75, + 0xb7, 0x09, 0xb9, 0xe1, 0x47, 0x71, 0x70, 0x5c, 0x05, 0x5f, 0x50, 0x50, + 0x9c, 0xd0, 0xe3, 0xc7, 0x91, 0xee, 0x6b, 0xc7, 0x0f, 0x71, 0x1b, 0xc3, + 0x48, 0x8b, 0xed, 0x15, 0x26, 0x8c, 0xc3, 0xd5, 0x54, 0x08, 0xcc, 0x33, + 0x79, 0xc0, 0x9f, 0x49, 0xc8, 0x75, 0xef, 0xb6, 0xf3, 0x29, 0x89, 0xfd, + 0x75, 0xd1, 0xda, 0x92, 0xc3, 0x13, 0xc6, 0x76, 0x51, 0x11, 0x40, 0x7b, + 0x82, 0xf7, 0x30, 0x79, 0x49, 0x04, 0xe3, 0xbb, 0x61, 0x34, 0xa6, 0x58, + 0x0b, 0x7d, 0xef, 0x3e, 0xf9, 0xb3, 0x8d, 0x2a, 0xba, 0xe9, 0xbc, 0xc0, + 0xa7, 0xe6, 0x6c, 0xda, 0xf8, 0x8c, 0xdf, 0x8d, 0x96, 0x83, 0x2d, 0x80, + 0x4f, 0x21, 0x81, 0xde, 0x57, 0x9d, 0x0a, 0x3c, 0xcc, 0xec, 0x3b, 0xb2, + 0x25, 0x96, 0x3c, 0xea, 0xfd, 0x46, 0x26, 0xbe, 0x1c, 0x79, 0x82, 0x1d, + 0xe0, 0x14, 0x22, 0x7c, 0x80, 0x3d, 0xbd, 0x05, 0x90, 0xfa, 0xaf, 0x7d, + 0x70, 0x13, 0x43, 0x0f, 0x3d, 0xa0, 0x7f, 0x92, 0x3a, 0x53, 0x69, 0xe4, + 0xb0, 0x10, 0x0d, 0xa7, 0x73, 0xa8, 0x8c, 0x74, 0xab, 0xd7, 0x78, 0x15, + 0x45, 0xec, 0x6e, 0xc8, 0x8b, 0xa0, 0xba, 0x21, 0x6f, 0xf3, 0x08, 0xb8, + 0xc7, 0x4f, 0x14, 0xf5, 0xcc, 0xfd, 0x39, 0xbc, 0x11, 0xf5, 0xb9, 0x11, + 0xba, 0xf3, 0x11, 0x24, 0x74, 0x3e, 0x0c, 0x07, 0x4f, 0xac, 0x2a, 0xb2, + 0xb1, 0x3c, 0x00, 0xfa, 0xbb, 0x8c, 0xd8, 0x7d, 0x17, 0x5b, 0x8d, 0x39, + 0xc6, 0x23, 0x31, 0x32, 0x7d, 0x6e, 0x20, 0x38, 0xd0, 0xc3, 0x58, 0xe2, + 0xb1, 0xfe, 0x53, 0x6b, 0xc7, 0x10, 0x13, 0x7e, 0xc6, 0x7c, 0x67, 0x59, + 0x43, 0x70, 0x4a, 0x2d, 0x7f, 0x76, 0xde, 0xbd, 0x45, 0x43, 0x56, 0x60, + 0xcd, 0xe9, 0x24, 0x7b, 0xb7, 0x41, 0xce, 0x56, 0xed, 0xd3, 0x74, 0x75, + 0xcc, 0x9d, 0x48, 0x61, 0xc8, 0x19, 0x66, 0x08, 0xfb, 0x28, 0x60, 0x1f, + 0x83, 0x11, 0xc0, 0x9b, 0xbd, 0x71, 0x53, 0x36, 0x01, 0x76, 0xa8, 0xc0, + 0xdc, 0x1d, 0x18, 0x85, 0x19, 0x65, 0xce, 0xcf, 0x14, 0x2e, 0x6c, 0x32, + 0x15, 0xbc, 0x2c, 0x5e, 0x8f, 0xfc, 0x3c, 0xf0, 0x2d, 0xf5, 0x5c, 0x04, + 0xc9, 0x22, 0xf4, 0xc3, 0xb8, 0x57, 0x79, 0x52, 0x41, 0xfd, 0xff, 0xcd, + 0x26, 0xa8, 0xc0, 0xd2, 0xe1, 0x71, 0xd6, 0xf1, 0xf4, 0x0c, 0xa8, 0xeb, + 0x0c, 0x33, 0x40, 0x25, 0x73, 
0xbb, 0x31, 0xda, 0x0c, 0xa6, 0xee, 0x0c, + 0x41, 0x51, 0x94, 0x3c, 0x24, 0x27, 0x65, 0xe9, 0xb5, 0xc4, 0xe2, 0x88, + 0xc0, 0x82, 0xd0, 0x72, 0xd9, 0x10, 0x4d, 0x7f, 0xc0, 0x88, 0x94, 0x41, + 0x2d, 0x05, 0x09, 0xfb, 0x97, 0x31, 0x6e, 0xc1, 0xe9, 0xf4, 0x50, 0x70, + 0xdc, 0x3f, 0x0a, 0x90, 0x46, 0x37, 0x60, 0x8c, 0xfb, 0x06, 0x6e, 0xde, + 0x6f, 0xa7, 0x6b, 0xa3, 0x88, 0x18, 0x96, 0x93, 0x19, 0x87, 0xe7, 0x0a, + 0x98, 0xf0, 0x13, 0x01, 0xab, 0x7c, 0xeb, 0x25, 0xa5, 0xe2, 0x98, 0x44, + 0x7d, 0x09, 0xe2, 0x42, 0x33, 0xd4, 0xeb, 0xcc, 0x9b, 0x70, 0xf6, 0x0f, + 0xf0, 0xb2, 0x99, 0xcc, 0x4f, 0x64, 0xc4, 0x69, 0x12, 0xea, 0x56, 0xfe, + 0x50, 0x0e, 0x02, 0x1f, 0x6d, 0x7a, 0x79, 0x62, 0xaa, 0x2e, 0x52, 0xaf, + 0xa3, 0xed, 0xcd, 0xa7, 0x45, 0xe6, 0x86, 0xed, 0xa1, 0x73, 0x5b, 0x1e, + 0x49, 0x4f, 0x92, 0x50, 0x83, 0x99, 0x3c, 0xf4, 0xf6, 0xa8, 0x49, 0xd7, + 0x08, 0xf7, 0xdc, 0x28, 0x2c, 0xe6, 0x22, 0x6f, 0xf8, 0xfa, 0xba, 0x9e, + 0x0a, 0xcf, 0x72, 0x74, 0x76, 0x75, 0x99, 0x4d, 0x3d, 0x9a, 0x4c, 0x54, + 0xcd, 0xf8, 0x54, 0xf0, 0xbd, 0x73, 0xe9, 0x4f, 0x29, 0xd0, 0xe1, 0x24, + 0x94, 0x52, 0xd6, 0x60, 0x80, 0x71, 0x24, 0x95, 0x92, 0x01, 0x0e, 0xa9, + 0x7e, 0x64, 0x2e, 0xed, 0x51, 0xcc, 0xd2, 0xff, 0xfd, 0x0b, 0xf4, 0x1d, + 0x25, 0x5d, 0x10, 0x87, 0x09, 0x55, 0x06, 0x95, 0xae, 0xb3, 0xef, 0xe9, + 0xaa, 0x36, 0x15, 0x97, 0xe6, 0xf2, 0x24, 0xcf, 0x7d, 0xcd, 0x55, 0x11, + 0xba, 0x20, 0xd0, 0xd7, 0xdc, 0xa6, + }; + EXPECT_EQ(Bytes(ciphertext), Bytes(kExpectedCiphertext)); + + static const uint8_t kExpectedSharedKey[HRSS_KEY_BYTES] = { + 0x04, 0x5a, 0x1a, 0xbc, 0x4c, 0x76, 0x47, 0x1f, 0xbf, 0xc9, 0x23, + 0xec, 0xcb, 0x6e, 0x4d, 0x59, 0x8d, 0x3f, 0x90, 0x3e, 0x53, 0x73, + 0x3c, 0x2c, 0x71, 0xcc, 0xac, 0xc5, 0xe0, 0xf2, 0xbc, 0xe8, + }; + EXPECT_EQ(Bytes(shared_key), Bytes(kExpectedSharedKey)); + + HRSS_decap(shared_key, &pub, &priv, ciphertext, sizeof(ciphertext)); + EXPECT_EQ(Bytes(shared_key, sizeof(shared_key)), + Bytes(kExpectedSharedKey, sizeof(kExpectedSharedKey))); + + // Corrupt the ciphertext and ensure that the failure key is constant. + ciphertext[50] ^= 4; + HRSS_decap(shared_key, &pub, &priv, ciphertext, sizeof(ciphertext)); + + static const uint8_t kExpectedFailureKey[HRSS_KEY_BYTES] = { + 0x3a, 0xec, 0xc0, 0x38, 0x4f, 0xa7, 0x17, 0xb2, 0x77, 0x61, 0xb1, + 0xf8, 0x12, 0x7f, 0xd9, 0x61, 0x67, 0x70, 0x63, 0xbe, 0xa2, 0x72, + 0xfe, 0x1a, 0x82, 0x8d, 0x1d, 0x90, 0xe0, 0x36, 0x69, 0x2d, + }; + EXPECT_EQ(Bytes(shared_key), Bytes(kExpectedFailureKey)); +} diff --git a/crypto/hrss/internal.h b/crypto/hrss/internal.h new file mode 100644 index 00000000..70218b88 --- /dev/null +++ b/crypto/hrss/internal.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2018, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + +#ifndef OPENSSL_HEADER_HRSS_INTERNAL_H +#define OPENSSL_HEADER_HRSS_INTERNAL_H + +#include +#include "../internal.h" + +#if defined(__cplusplus) +extern "C" { +#endif + + +#define N 701 +#define BITS_PER_WORD (sizeof(crypto_word_t) * 8) +#define WORDS_PER_POLY ((N + BITS_PER_WORD - 1) / BITS_PER_WORD) +#define BITS_IN_LAST_WORD (N % BITS_PER_WORD) + +struct poly2 { + crypto_word_t v[WORDS_PER_POLY]; +}; + +struct poly3 { + struct poly2 s, a; +}; + +OPENSSL_EXPORT void HRSS_poly2_rotr_consttime(struct poly2 *p, size_t bits); +OPENSSL_EXPORT void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x, + const struct poly3 *y); +OPENSSL_EXPORT void HRSS_poly3_invert(struct poly3 *out, + const struct poly3 *in); + + +#if defined(__cplusplus) +} // extern "C" +#endif + +#endif // !OPENSSL_HEADER_HRSS_INTERNAL_H diff --git a/crypto/obj/obj_dat.h b/crypto/obj/obj_dat.h index 0f5a3fa0..0313a08a 100644 --- a/crypto/obj/obj_dat.h +++ b/crypto/obj/obj_dat.h @@ -57,7 +57,7 @@ /* This file is generated by crypto/obj/objects.go. */ -#define NUM_NID 959 +#define NUM_NID 960 static const uint8_t kObjectData[] = { /* NID_rsadsi */ @@ -8755,6 +8755,7 @@ static const ASN1_OBJECT kObjects[NUM_NID] = { {"AuthPSK", "auth-psk", NID_auth_psk, 0, NULL, 0}, {"KxANY", "kx-any", NID_kx_any, 0, NULL, 0}, {"AuthANY", "auth-any", NID_auth_any, 0, NULL, 0}, + {"CECPQ2", "CECPQ2", NID_CECPQ2, 0, NULL, 0}, }; static const unsigned kNIDsInShortNameOrder[] = { @@ -8816,6 +8817,7 @@ static const unsigned kNIDsInShortNameOrder[] = { 110 /* CAST5-CFB */, 109 /* CAST5-ECB */, 111 /* CAST5-OFB */, + 959 /* CECPQ2 */, 894 /* CMAC */, 13 /* CN */, 141 /* CRLReason */, @@ -9720,6 +9722,7 @@ static const unsigned kNIDsInLongNameOrder[] = { 285 /* Biometric Info */, 179 /* CA Issuers */, 785 /* CA Repository */, + 959 /* CECPQ2 */, 131 /* Code Signing */, 783 /* Diffie-Hellman based MAC */, 382 /* Directory */, diff --git a/crypto/obj/obj_mac.num b/crypto/obj/obj_mac.num index 6dbc0f13..5fa839d2 100644 --- a/crypto/obj/obj_mac.num +++ b/crypto/obj/obj_mac.num @@ -947,3 +947,4 @@ auth_ecdsa 955 auth_psk 956 kx_any 957 auth_any 958 +CECPQ2 959 diff --git a/crypto/obj/objects.txt b/crypto/obj/objects.txt index 0c48e3c0..6dbb7ad7 100644 --- a/crypto/obj/objects.txt +++ b/crypto/obj/objects.txt @@ -559,7 +559,7 @@ id-cmc 19 : id-cmc-responseInfo id-cmc 21 : id-cmc-queryPending id-cmc 22 : id-cmc-popLinkRandom id-cmc 23 : id-cmc-popLinkWitness -id-cmc 24 : id-cmc-confirmCertAcceptance +id-cmc 24 : id-cmc-confirmCertAcceptance # other names id-on 1 : id-on-personalData @@ -1239,7 +1239,7 @@ cryptocom 1 8 1 : id-GostR3410-2001-ParamSet-cc : GOST R 3410-2001 Parameter Se # Definitions for Camellia cipher - ECB, CFB, OFB MODE !Alias ntt-ds 0 3 4401 5 -!Alias camellia ntt-ds 3 1 9 +!Alias camellia ntt-ds 3 1 9 camellia 1 : CAMELLIA-128-ECB : camellia-128-ecb !Cname camellia-128-ofb128 @@ -1310,7 +1310,7 @@ ISO-US 10046 2 1 : dhpublicnumber : X9.42 DH 1 3 36 3 3 2 8 1 1 11 : brainpoolP384r1 1 3 36 3 3 2 8 1 1 12 : brainpoolP384t1 1 3 36 3 3 2 8 1 1 13 : brainpoolP512r1 -1 3 36 3 3 2 8 1 1 14 : brainpoolP512t1 +1 3 36 3 3 2 8 1 1 14 : brainpoolP512t1 # ECDH schemes from RFC5753 !Alias x9-63-scheme 1 3 133 16 840 63 0 @@ -1334,6 +1334,9 @@ secg-scheme 14 3 : dhSinglePass-cofactorDH-sha512kdf-scheme # NID for X25519 (no corresponding OID). : X25519 +# NID for CECPQ2 (no corresponding OID). + : CECPQ2 + # See RFC 8410. 
1 3 101 112 : ED25519 diff --git a/include/openssl/hrss.h b/include/openssl/hrss.h new file mode 100644 index 00000000..4e1c73ff --- /dev/null +++ b/include/openssl/hrss.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2018, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_HRSS_H +#define OPENSSL_HEADER_HRSS_H + +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +// HRSS +// +// HRSS is a structured-lattice-based post-quantum key encapsulation mechanism. +// The best exposition is https://eprint.iacr.org/2017/667.pdf although this +// implementation uses a different KEM construction based on +// https://eprint.iacr.org/2017/1005.pdf. + +struct HRSS_private_key { + uint8_t opaque[1808]; +}; + +struct HRSS_public_key { + uint8_t opaque[1424]; +}; + +// HRSS_SAMPLE_BYTES is the number of bytes of entropy needed to generate a +// short vector. There are 701 coefficients, but the final one is always set to +// zero when sampling. Otherwise, one byte of input is enough to generate two +// coefficients. +#define HRSS_SAMPLE_BYTES ((701 - 1) / 2) +// HRSS_GENERATE_KEY_BYTES is the number of bytes of entropy needed to generate +// an HRSS key pair. +#define HRSS_GENERATE_KEY_BYTES (HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32) +// HRSS_ENCAP_BYTES is the number of bytes of entropy needed to encapsulate a +// session key. +#define HRSS_ENCAP_BYTES (HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES) +// HRSS_PUBLIC_KEY_BYTES is the number of bytes in a public key. +#define HRSS_PUBLIC_KEY_BYTES 1138 +// HRSS_CIPHERTEXT_BYTES is the number of bytes in a ciphertext. +#define HRSS_CIPHERTEXT_BYTES (1138 + 32) +// HRSS_KEY_BYTES is the number of bytes in a shared key. +#define HRSS_KEY_BYTES 32 +// HRSS_POLY3_BYTES is the number of bytes needed to serialise a mod 3 +// polynomial. +#define HRSS_POLY3_BYTES 140 +#define HRSS_PRIVATE_KEY_BYTES \ + (HRSS_POLY3_BYTES * 2 + HRSS_PUBLIC_KEY_BYTES + 2 + 32) + +// HRSS_generate_key is a deterministic function that outputs a public and +// private key based on the given entropy. +OPENSSL_EXPORT void HRSS_generate_key( + struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv, + const uint8_t input[HRSS_GENERATE_KEY_BYTES]); + +// HRSS_encap is a deterministic function the generates and encrypts a random +// session key from the given entropy, writing those values to |out_shared_key| +// and |out_ciphertext|, respectively. +OPENSSL_EXPORT void HRSS_encap(uint8_t out_ciphertext[HRSS_CIPHERTEXT_BYTES], + uint8_t out_shared_key[HRSS_KEY_BYTES], + const struct HRSS_public_key *in_pub, + const uint8_t in[HRSS_ENCAP_BYTES]); + +// HRSS_decap decrypts a session key from |ciphertext_len| bytes of +// |ciphertext|. If the ciphertext is valid, the decrypted key is written to +// |out_shared_key|. 
Otherwise the HMAC of |ciphertext| under a secret key (kept +// in |in_priv|) is written. If the ciphertext is the wrong length then it will +// leak which was done via side-channels. Otherwise it should perform either +// action in constant-time. +OPENSSL_EXPORT void HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES], + const struct HRSS_public_key *in_pub, + const struct HRSS_private_key *in_priv, + const uint8_t *ciphertext, + size_t ciphertext_len); + +// HRSS_marshal_public_key serialises |in_pub| to |out|. +OPENSSL_EXPORT void HRSS_marshal_public_key( + uint8_t out[HRSS_PUBLIC_KEY_BYTES], const struct HRSS_public_key *in_pub); + +// HRSS_parse_public_key sets |*out| to the public-key encoded in |in|. It +// returns true on success and zero on error. +OPENSSL_EXPORT int HRSS_parse_public_key( + struct HRSS_public_key *out, const uint8_t in[HRSS_PUBLIC_KEY_BYTES]); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_HRSS_H diff --git a/include/openssl/nid.h b/include/openssl/nid.h index afeb2dea..270d443a 100644 --- a/include/openssl/nid.h +++ b/include/openssl/nid.h @@ -4234,6 +4234,9 @@ extern "C" { #define LN_auth_any "auth-any" #define NID_auth_any 958 +#define SN_CECPQ2 "CECPQ2" +#define NID_CECPQ2 959 + #if defined(__cplusplus) } /* extern C */ diff --git a/include/openssl/ssl.h b/include/openssl/ssl.h index 17c55925..2f8163ab 100644 --- a/include/openssl/ssl.h +++ b/include/openssl/ssl.h @@ -2177,6 +2177,7 @@ OPENSSL_EXPORT int SSL_set1_curves_list(SSL *ssl, const char *curves); #define SSL_CURVE_SECP384R1 24 #define SSL_CURVE_SECP521R1 25 #define SSL_CURVE_X25519 29 +#define SSL_CURVE_CECPQ2 16696 // SSL_get_curve_id returns the ID of the curve used by |ssl|'s most recently // completed handshake or 0 if not applicable. diff --git a/ssl/handoff.cc b/ssl/handoff.cc index 4cca9818..f9dbd135 100644 --- a/ssl/handoff.cc +++ b/ssl/handoff.cc @@ -307,7 +307,7 @@ bool SSL_serialize_handback(const SSL *ssl, CBB *out) { return false; } if (type == handback_after_ecdhe && - !s3->hs->key_share->Serialize(&key_share)) { + !s3->hs->key_shares[0]->Serialize(&key_share)) { return false; } return CBB_flush(out); @@ -471,7 +471,7 @@ bool SSL_apply_handback(SSL *ssl, Span handback) { return false; } if (type == handback_after_ecdhe && - (s3->hs->key_share = SSLKeyShare::Create(&key_share)) == nullptr) { + (s3->hs->key_shares[0] = SSLKeyShare::Create(&key_share)) == nullptr) { return false; } diff --git a/ssl/handshake_client.cc b/ssl/handshake_client.cc index c1d54bd8..0274dc2a 100644 --- a/ssl/handshake_client.cc +++ b/ssl/handshake_client.cc @@ -590,7 +590,8 @@ static enum ssl_hs_wait_t do_read_server_hello(SSL_HANDSHAKE *hs) { } // Clear some TLS 1.3 state that no longer needs to be retained. - hs->key_share.reset(); + hs->key_shares[0].reset(); + hs->key_shares[1].reset(); hs->key_share_bytes.Reset(); // A TLS 1.2 server would not know to skip the early data we offered. Report @@ -1006,8 +1007,8 @@ static enum ssl_hs_wait_t do_read_server_key_exchange(SSL_HANDSHAKE *hs) { } // Initialize ECDH and save the peer public key for later. - hs->key_share = SSLKeyShare::Create(group_id); - if (!hs->key_share || + hs->key_shares[0] = SSLKeyShare::Create(group_id); + if (!hs->key_shares[0] || !hs->peer_key.CopyFrom(point)) { return ssl_hs_error; } @@ -1324,7 +1325,7 @@ static enum ssl_hs_wait_t do_send_client_key_exchange(SSL_HANDSHAKE *hs) { // Compute the premaster. 
uint8_t alert = SSL_AD_DECODE_ERROR; - if (!hs->key_share->Accept(&child, &pms, &alert, hs->peer_key)) { + if (!hs->key_shares[0]->Accept(&child, &pms, &alert, hs->peer_key)) { ssl_send_alert(ssl, SSL3_AL_FATAL, alert); return ssl_hs_error; } @@ -1333,7 +1334,8 @@ static enum ssl_hs_wait_t do_send_client_key_exchange(SSL_HANDSHAKE *hs) { } // The key exchange state may now be discarded. - hs->key_share.reset(); + hs->key_shares[0].reset(); + hs->key_shares[1].reset(); hs->peer_key.Reset(); } else if (alg_k & SSL_kPSK) { // For plain PSK, other_secret is a block of 0s with the same length as diff --git a/ssl/handshake_server.cc b/ssl/handshake_server.cc index c4f3b75e..8b3b9428 100644 --- a/ssl/handshake_server.cc +++ b/ssl/handshake_server.cc @@ -932,12 +932,12 @@ static enum ssl_hs_wait_t do_send_server_certificate(SSL_HANDSHAKE *hs) { hs->new_session->group_id = group_id; // Set up ECDH, generate a key, and emit the public half. - hs->key_share = SSLKeyShare::Create(group_id); - if (!hs->key_share || + hs->key_shares[0] = SSLKeyShare::Create(group_id); + if (!hs->key_shares[0] || !CBB_add_u8(cbb.get(), NAMED_CURVE_TYPE) || !CBB_add_u16(cbb.get(), group_id) || !CBB_add_u8_length_prefixed(cbb.get(), &child) || - !hs->key_share->Offer(&child)) { + !hs->key_shares[0]->Offer(&child)) { return ssl_hs_error; } } else { @@ -1275,13 +1275,14 @@ static enum ssl_hs_wait_t do_read_client_key_exchange(SSL_HANDSHAKE *hs) { // Compute the premaster. uint8_t alert = SSL_AD_DECODE_ERROR; - if (!hs->key_share->Finish(&premaster_secret, &alert, peer_key)) { + if (!hs->key_shares[0]->Finish(&premaster_secret, &alert, peer_key)) { ssl_send_alert(ssl, SSL3_AL_FATAL, alert); return ssl_hs_error; } // The key exchange state may now be discarded. - hs->key_share.reset(); + hs->key_shares[0].reset(); + hs->key_shares[1].reset(); } else if (!(alg_k & SSL_kPSK)) { OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR); ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE); diff --git a/ssl/internal.h b/ssl/internal.h index f8a2ea70..bbce7ec4 100644 --- a/ssl/internal.h +++ b/ssl/internal.h @@ -974,10 +974,10 @@ class SSLKeyShare { // |out_public_key|. It returns true on success and false on error. virtual bool Offer(CBB *out_public_key) PURE_VIRTUAL; - // Accept performs a key exchange against the |peer_key| generated by |offer|. + // Accept performs a key exchange against the |peer_key| generated by |Offer|. // On success, it returns true, writes the public value to |out_public_key|, - // and sets |*out_secret| the shared secret. On failure, it returns false and - // sets |*out_alert| to an alert to send to the peer. + // and sets |*out_secret| to the shared secret. On failure, it returns false + // and sets |*out_alert| to an alert to send to the peer. // // The default implementation calls |Offer| and then |Finish|, assuming a key // exchange protocol where the peers are symmetric. @@ -986,7 +986,7 @@ class SSLKeyShare { // Finish performs a key exchange against the |peer_key| generated by // |Accept|. On success, it returns true and sets |*out_secret| to the shared - // secret. On failure, it returns zero and sets |*out_alert| to an alert to + // secret. On failure, it returns false and sets |*out_alert| to an alert to // send to the peer. virtual bool Finish(Array *out_secret, uint8_t *out_alert, Span peer_key) PURE_VIRTUAL; @@ -1436,8 +1436,10 @@ struct SSL_HANDSHAKE { // error, if |wait| is |ssl_hs_error|, is the error the handshake failed on. 
UniquePtr error; - // key_share is the current key exchange instance. - UniquePtr key_share; + // key_shares are the current key exchange instances. The second is only used + // as a client if we believe that we should offer two key shares in a + // ClientHello. + UniquePtr key_shares[2]; // transcript is the current handshake transcript. SSLTranscript transcript; diff --git a/ssl/ssl_key_share.cc b/ssl/ssl_key_share.cc index 55c74633..108ea6a9 100644 --- a/ssl/ssl_key_share.cc +++ b/ssl/ssl_key_share.cc @@ -24,8 +24,10 @@ #include #include #include +#include #include #include +#include #include "internal.h" #include "../crypto/internal.h" @@ -207,12 +209,104 @@ class X25519KeyShare : public SSLKeyShare { uint8_t private_key_[32]; }; +class CECPQ2KeyShare : public SSLKeyShare { + public: + CECPQ2KeyShare() {} + + uint16_t GroupID() const override { return SSL_CURVE_CECPQ2; } + + bool Offer(CBB *out) override { + uint8_t x25519_public_key[32]; + X25519_keypair(x25519_public_key, x25519_private_key_); + + uint8_t hrss_entropy[HRSS_GENERATE_KEY_BYTES]; + RAND_bytes(hrss_entropy, sizeof(hrss_entropy)); + HRSS_generate_key(&hrss_public_key_, &hrss_private_key_, hrss_entropy); + + uint8_t hrss_public_key_bytes[HRSS_PUBLIC_KEY_BYTES]; + HRSS_marshal_public_key(hrss_public_key_bytes, &hrss_public_key_); + + if (!CBB_add_bytes(out, x25519_public_key, sizeof(x25519_public_key)) || + !CBB_add_bytes(out, hrss_public_key_bytes, + sizeof(hrss_public_key_bytes))) { + return false; + } + + return true; + }; + + bool Accept(CBB *out_public_key, Array *out_secret, + uint8_t *out_alert, Span peer_key) override { + Array secret; + if (!secret.Init(32 + HRSS_KEY_BYTES)) { + OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); + return false; + } + + uint8_t x25519_public_key[32]; + X25519_keypair(x25519_public_key, x25519_private_key_); + + HRSS_public_key peer_public_key; + if (peer_key.size() != 32 + HRSS_PUBLIC_KEY_BYTES || + !HRSS_parse_public_key(&peer_public_key, peer_key.data() + 32) || + !X25519(secret.data(), x25519_private_key_, peer_key.data())) { + *out_alert = SSL_AD_DECODE_ERROR; + OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT); + return false; + } + + uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES]; + uint8_t entropy[HRSS_ENCAP_BYTES]; + RAND_bytes(entropy, sizeof(entropy)); + HRSS_encap(ciphertext, secret.data() + 32, &peer_public_key, entropy); + + if (!CBB_add_bytes(out_public_key, x25519_public_key, + sizeof(x25519_public_key)) || + !CBB_add_bytes(out_public_key, ciphertext, sizeof(ciphertext))) { + return false; + } + + *out_secret = std::move(secret); + return true; + } + + bool Finish(Array *out_secret, uint8_t *out_alert, + Span peer_key) override { + *out_alert = SSL_AD_INTERNAL_ERROR; + + Array secret; + if (!secret.Init(32 + HRSS_KEY_BYTES)) { + OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); + return false; + } + + if (peer_key.size() != 32 + HRSS_CIPHERTEXT_BYTES || + !X25519(secret.data(), x25519_private_key_, peer_key.data())) { + *out_alert = SSL_AD_DECODE_ERROR; + OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT); + return false; + } + + HRSS_decap(secret.data() + 32, &hrss_public_key_, &hrss_private_key_, + peer_key.data() + 32, peer_key.size() - 32); + + *out_secret = std::move(secret); + return true; + }; + + private: + uint8_t x25519_private_key_[32]; + HRSS_public_key hrss_public_key_; + HRSS_private_key hrss_private_key_; +}; + CONSTEXPR_ARRAY NamedGroup kNamedGroups[] = { {NID_secp224r1, SSL_CURVE_SECP224R1, "P-224", "secp224r1"}, {NID_X9_62_prime256v1, SSL_CURVE_SECP256R1, "P-256", "prime256v1"}, 
{NID_secp384r1, SSL_CURVE_SECP384R1, "P-384", "secp384r1"}, {NID_secp521r1, SSL_CURVE_SECP521R1, "P-521", "secp521r1"}, {NID_X25519, SSL_CURVE_X25519, "X25519", "x25519"}, + {NID_CECPQ2, SSL_CURVE_CECPQ2, "CECPQ2", "CECPQ2"}, }; } // namespace @@ -237,6 +331,8 @@ UniquePtr SSLKeyShare::Create(uint16_t group_id) { New(NID_secp521r1, SSL_CURVE_SECP521R1)); case SSL_CURVE_X25519: return UniquePtr(New()); + case SSL_CURVE_CECPQ2: + return UniquePtr(New()); default: return nullptr; } diff --git a/ssl/ssl_test.cc b/ssl/ssl_test.cc index 470379c0..705528b3 100644 --- a/ssl/ssl_test.cc +++ b/ssl/ssl_test.cc @@ -394,6 +394,11 @@ static const CurveTest kCurveTests[] = { "P-256", { SSL_CURVE_SECP256R1 }, }, + { + "P-256:CECPQ2", + { SSL_CURVE_SECP256R1, SSL_CURVE_CECPQ2 }, + }, + { "P-256:P-384:P-521:X25519", { diff --git a/ssl/t1_lib.cc b/ssl/t1_lib.cc index 00c796ad..5e65f819 100644 --- a/ssl/t1_lib.cc +++ b/ssl/t1_lib.cc @@ -292,10 +292,23 @@ static const uint16_t kDefaultGroups[] = { SSL_CURVE_SECP384R1, }; +// TLS 1.3 servers will pick CECPQ2 if offered by a client, but it's not enabled +// by default for clients. +static const uint16_t kDefaultGroupsServer[] = { + // CECPQ2 is not yet enabled by default. + // SSL_CURVE_CECPQ2, + SSL_CURVE_X25519, + SSL_CURVE_SECP256R1, + SSL_CURVE_SECP384R1, +};; + Span tls1_get_grouplist(const SSL_HANDSHAKE *hs) { if (!hs->config->supported_group_list.empty()) { return hs->config->supported_group_list; } + if (hs->ssl->server) { + return Span(kDefaultGroupsServer); + } return Span(kDefaultGroups); } @@ -324,7 +337,11 @@ bool tls1_get_shared_group(SSL_HANDSHAKE *hs, uint16_t *out_group_id) { for (uint16_t pref_group : pref) { for (uint16_t supp_group : supp) { - if (pref_group == supp_group) { + if (pref_group == supp_group && + // CECPQ2 doesn't fit in the u8-length-prefixed ECPoint field in TLS + // 1.2 and below. + (ssl_protocol_version(ssl) >= TLS1_3_VERSION || + pref_group != SSL_CURVE_CECPQ2)) { *out_group_id = pref_group; return true; } @@ -386,6 +403,12 @@ bool tls1_set_curves_list(Array *out_group_ids, const char *curves) { } bool tls1_check_group_id(const SSL_HANDSHAKE *hs, uint16_t group_id) { + if (group_id == SSL_CURVE_CECPQ2 && + ssl_protocol_version(hs->ssl) < TLS1_3_VERSION) { + // CECPQ2 requires TLS 1.3. + return false; + } + for (uint16_t supported : tls1_get_grouplist(hs)) { if (supported == group_id) { return true; @@ -2144,6 +2167,7 @@ static bool ext_key_share_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) { } uint16_t group_id = hs->retry_group; + uint16_t second_group_id = 0; if (hs->received_hello_retry_request) { // We received a HelloRetryRequest without a new curve, so there is no new // share to append. Leave |hs->key_share| as-is. @@ -2174,19 +2198,38 @@ static bool ext_key_share_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) { } group_id = groups[0]; + + if (group_id == SSL_CURVE_CECPQ2 && groups.size() >= 2) { + // CECPQ2 is not sent as the only initial key share. We'll include the + // 2nd preference group too to avoid round-trips. 
+ second_group_id = groups[1]; + assert(second_group_id != group_id); + } } - hs->key_share = SSLKeyShare::Create(group_id); CBB key_exchange; - if (!hs->key_share || + hs->key_shares[0] = SSLKeyShare::Create(group_id); + if (!hs->key_shares[0] || !CBB_add_u16(&kse_bytes, group_id) || !CBB_add_u16_length_prefixed(&kse_bytes, &key_exchange) || - !hs->key_share->Offer(&key_exchange) || + !hs->key_shares[0]->Offer(&key_exchange) || !CBB_flush(&kse_bytes)) { return false; } - // Save the contents of the extension to repeat it in the second ClientHello. + if (second_group_id != 0) { + hs->key_shares[1] = SSLKeyShare::Create(second_group_id); + if (!hs->key_shares[1] || + !CBB_add_u16(&kse_bytes, second_group_id) || + !CBB_add_u16_length_prefixed(&kse_bytes, &key_exchange) || + !hs->key_shares[1]->Offer(&key_exchange) || + !CBB_flush(&kse_bytes)) { + return false; + } + } + + // Save the contents of the extension to repeat it in the second + // ClientHello. if (!hs->received_hello_retry_request && !hs->key_share_bytes.CopyFrom( MakeConstSpan(CBB_data(&kse_bytes), CBB_len(&kse_bytes)))) { @@ -2209,19 +2252,24 @@ bool ssl_ext_key_share_parse_serverhello(SSL_HANDSHAKE *hs, return false; } - if (hs->key_share->GroupID() != group_id) { - *out_alert = SSL_AD_ILLEGAL_PARAMETER; - OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE); - return false; + SSLKeyShare *key_share = hs->key_shares[0].get(); + if (key_share->GroupID() != group_id) { + if (!hs->key_shares[1] || hs->key_shares[1]->GroupID() != group_id) { + *out_alert = SSL_AD_ILLEGAL_PARAMETER; + OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE); + return false; + } + key_share = hs->key_shares[1].get(); } - if (!hs->key_share->Finish(out_secret, out_alert, peer_key)) { + if (!key_share->Finish(out_secret, out_alert, peer_key)) { *out_alert = SSL_AD_INTERNAL_ERROR; return false; } hs->new_session->group_id = group_id; - hs->key_share.reset(); + hs->key_shares[0].reset(); + hs->key_shares[1].reset(); return true; } @@ -2389,6 +2437,10 @@ static bool ext_supported_groups_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) { } for (uint16_t group : tls1_get_grouplist(hs)) { + if (group == SSL_CURVE_CECPQ2 && + hs->max_version < TLS1_3_VERSION) { + continue; + } if (!CBB_add_u16(&groups_bytes, group)) { return false; } diff --git a/ssl/test/bssl_shim.cc b/ssl/test/bssl_shim.cc index 675a08a0..3632fc58 100644 --- a/ssl/test/bssl_shim.cc +++ b/ssl/test/bssl_shim.cc @@ -649,7 +649,6 @@ static bool DoConnection(bssl::UniquePtr *out_session, SSL_set_connect_state(ssl.get()); } - int sock = Connect(config->port); if (sock == -1) { return false; diff --git a/ssl/test/runner/cipher_suites.go b/ssl/test/runner/cipher_suites.go index f4c59006..3246f0b7 100644 --- a/ssl/test/runner/cipher_suites.go +++ b/ssl/test/runner/cipher_suites.go @@ -26,7 +26,7 @@ type keyAgreement interface { // In the case that the key agreement protocol doesn't use a // ServerKeyExchange message, generateServerKeyExchange can return nil, // nil. - generateServerKeyExchange(*Config, *Certificate, *clientHelloMsg, *serverHelloMsg) (*serverKeyExchangeMsg, error) + generateServerKeyExchange(*Config, *Certificate, *clientHelloMsg, *serverHelloMsg, uint16) (*serverKeyExchangeMsg, error) processClientKeyExchange(*Config, *Certificate, *clientKeyExchangeMsg, uint16) ([]byte, error) // On the client side, the next two methods are called in order. 
diff --git a/ssl/test/runner/common.go b/ssl/test/runner/common.go index 73b8889e..d99518c9 100644 --- a/ssl/test/runner/common.go +++ b/ssl/test/runner/common.go @@ -163,6 +163,7 @@ const ( CurveP384 CurveID = 24 CurveP521 CurveID = 25 CurveX25519 CurveID = 29 + CurveCECPQ2 CurveID = 16696 ) // TLS Elliptic Curve Point Formats @@ -1645,6 +1646,18 @@ type ProtocolBugs struct { // ExpectJDK11DowngradeRandom is whether the client should expect the // server to send the JDK 11 downgrade signal. ExpectJDK11DowngradeRandom bool + + // FailIfHelloRetryRequested causes a handshake failure if a server requests a + // hello retry. + FailIfHelloRetryRequested bool + + // FailedIfCECPQ2Offered will cause a server to reject a ClientHello if CECPQ2 + // is supported. + FailIfCECPQ2Offered bool + + // ExpectKeyShares, if not nil, lists (in order) the curves that a ClientHello + // should have key shares for. + ExpectedKeyShares []CurveID } func (c *Config) serverInit() { @@ -1724,7 +1737,7 @@ func (c *Config) maxVersion(isDTLS bool) uint16 { return ret } -var defaultCurvePreferences = []CurveID{CurveX25519, CurveP256, CurveP384, CurveP521} +var defaultCurvePreferences = []CurveID{CurveCECPQ2, CurveX25519, CurveP256, CurveP384, CurveP521} func (c *Config) curvePreferences() []CurveID { if c == nil || len(c.CurvePreferences) == 0 { diff --git a/ssl/test/runner/handshake_client.go b/ssl/test/runner/handshake_client.go index ab1f4dd2..5234462d 100644 --- a/ssl/test/runner/handshake_client.go +++ b/ssl/test/runner/handshake_client.go @@ -549,6 +549,9 @@ NextCipherSuite: helloRetryRequest, haveHelloRetryRequest := msg.(*helloRetryRequestMsg) var secondHelloBytes []byte if haveHelloRetryRequest { + if c.config.Bugs.FailIfHelloRetryRequested { + return errors.New("tls: unexpected HelloRetryRequest") + } // Explicitly read the ChangeCipherSpec now; it should // be attached to the first flight, not the second flight. if err := c.readTLS13ChangeCipherSpec(); err != nil { diff --git a/ssl/test/runner/handshake_server.go b/ssl/test/runner/handshake_server.go index 6a752421..5486342a 100644 --- a/ssl/test/runner/handshake_server.go +++ b/ssl/test/runner/handshake_server.go @@ -208,6 +208,26 @@ func (hs *serverHandshakeState) readClientHello() error { } } + if config.Bugs.FailIfCECPQ2Offered { + for _, offeredCurve := range hs.clientHello.supportedCurves { + if offeredCurve == CurveCECPQ2 { + return errors.New("tls: CECPQ2 was offered") + } + } + } + + if expected := config.Bugs.ExpectedKeyShares; expected != nil { + if len(expected) != len(hs.clientHello.keyShares) { + return fmt.Errorf("tls: expected %d key shares, but found %d", len(expected), len(hs.clientHello.keyShares)) + } + + for i, group := range expected { + if found := hs.clientHello.keyShares[i].group; found != group { + return fmt.Errorf("tls: key share #%d is for group %d, not %d", i, found, group) + } + } + } + c.clientVersion = hs.clientHello.vers // Use the versions extension if supplied, otherwise use the legacy ClientHello version. @@ -1212,6 +1232,11 @@ func (hs *serverHandshakeState) processClientHello() (isResume bool, err error) preferredCurves := config.curvePreferences() Curves: for _, curve := range hs.clientHello.supportedCurves { + if curve == CurveCECPQ2 && c.vers < VersionTLS13 { + // CECPQ2 is TLS 1.3-only. 
+ continue + } + for _, supported := range preferredCurves { if supported == curve { supportedCurve = true @@ -1621,7 +1646,7 @@ func (hs *serverHandshakeState) doFullHandshake() error { } keyAgreement := hs.suite.ka(c.vers) - skx, err := keyAgreement.generateServerKeyExchange(config, hs.cert, hs.clientHello, hs.hello) + skx, err := keyAgreement.generateServerKeyExchange(config, hs.cert, hs.clientHello, hs.hello, c.vers) if err != nil { c.sendAlert(alertHandshakeFailure) return err diff --git a/ssl/test/runner/hrss/hrss.go b/ssl/test/runner/hrss/hrss.go new file mode 100644 index 00000000..ebda6567 --- /dev/null +++ b/ssl/test/runner/hrss/hrss.go @@ -0,0 +1,1230 @@ +package hrss + +import ( + "crypto/hmac" + "crypto/sha256" + "crypto/subtle" + "encoding/binary" + "io" + "math/bits" +) + +const ( + PublicKeySize = modQBytes + CiphertextSize = modQBytes + 32 +) + +const ( + N = 701 + Q = 8192 + mod3Bytes = 140 + modQBytes = 1138 +) + +const ( + bitsPerWord = bits.UintSize + wordsPerPoly = (N + bitsPerWord - 1) / bitsPerWord + fullWordsPerPoly = N / bitsPerWord + bitsInLastWord = N % bitsPerWord +) + +// poly3 represents a degree-N polynomial over GF(3). Each coefficient is +// bitsliced across the |s| and |a| arrays, like this: +// +// s | a | value +// ----------------- +// 0 | 0 | 0 +// 0 | 1 | 1 +// 1 | 0 | 2 (aka -1) +// 1 | 1 | +// +// ('s' is for sign, and 'a' is just a letter.) +// +// Once bitsliced as such, the following circuits can be used to implement +// addition and multiplication mod 3: +// +// (s3, a3) = (s1, a1) × (s2, a2) +// s3 = (s2 ∧ a1) ⊕ (s1 ∧ a2) +// a3 = (s1 ∧ s2) ⊕ (a1 ∧ a2) +// +// (s3, a3) = (s1, a1) + (s2, a2) +// t1 = ~(s1 ∨ a1) +// t2 = ~(s2 ∨ a2) +// s3 = (a1 ∧ a2) ⊕ (t1 ∧ s2) ⊕ (t2 ∧ s1) +// a3 = (s1 ∧ s2) ⊕ (t1 ∧ a2) ⊕ (t2 ∧ a1) +// +// Negating a value just involves swapping s and a. +type poly3 struct { + s [wordsPerPoly]uint + a [wordsPerPoly]uint +} + +func (p *poly3) trim() { + p.s[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1 + p.a[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1 +} + +func (p *poly3) zero() { + for i := range p.a { + p.s[i] = 0 + p.a[i] = 0 + } +} + +func (p *poly3) fromDiscrete(in *poly) { + var shift uint + s := p.s[:] + a := p.a[:] + s[0] = 0 + a[0] = 0 + + for _, v := range in { + s[0] >>= 1 + s[0] |= uint((v>>1)&1) << (bitsPerWord - 1) + a[0] >>= 1 + a[0] |= uint(v&1) << (bitsPerWord - 1) + shift++ + if shift == bitsPerWord { + s = s[1:] + a = a[1:] + s[0] = 0 + a[0] = 0 + shift = 0 + } + } + + a[0] >>= bitsPerWord - shift + s[0] >>= bitsPerWord - shift +} + +func (p *poly3) fromModQ(in *poly) int { + var shift uint + s := p.s[:] + a := p.a[:] + s[0] = 0 + a[0] = 0 + ok := 1 + + for _, v := range in { + vMod3, vOk := modQToMod3(v) + ok &= vOk + + s[0] >>= 1 + s[0] |= uint((vMod3>>1)&1) << (bitsPerWord - 1) + a[0] >>= 1 + a[0] |= uint(vMod3&1) << (bitsPerWord - 1) + shift++ + if shift == bitsPerWord { + s = s[1:] + a = a[1:] + s[0] = 0 + a[0] = 0 + shift = 0 + } + } + + a[0] >>= bitsPerWord - shift + s[0] >>= bitsPerWord - shift + + return ok +} + +func (p *poly3) fromDiscreteMod3(in *poly) { + var shift uint + s := p.s[:] + a := p.a[:] + s[0] = 0 + a[0] = 0 + + for _, v := range in { + // This duplicates the 13th bit upwards to the top of the + // uint16, essentially treating it as a sign bit and converting + // into a signed int16. The signed value is reduced mod 3, + // yeilding {-2, -1, 0, 1, 2}. + v = uint16((int16(v<<3)>>3)%3) & 7 + + // We want to map v thus: + // {-2, -1, 0, 1, 2} -> {1, 2, 0, 1, 2}. 
We take the bottom + // three bits and then the constants below, when shifted by + // those three bits, perform the required mapping. + s[0] >>= 1 + s[0] |= (0xbc >> v) << (bitsPerWord - 1) + a[0] >>= 1 + a[0] |= (0x7a >> v) << (bitsPerWord - 1) + shift++ + if shift == bitsPerWord { + s = s[1:] + a = a[1:] + s[0] = 0 + a[0] = 0 + shift = 0 + } + } + + a[0] >>= bitsPerWord - shift + s[0] >>= bitsPerWord - shift +} + +func (p *poly3) marshal(out []byte) { + s := p.s[:] + a := p.a[:] + sw := s[0] + aw := a[0] + var shift int + + for i := 0; i < 700; i += 5 { + acc, scale := 0, 1 + for j := 0; j < 5; j++ { + v := int(aw&1) | int(sw&1)<<1 + acc += scale * v + scale *= 3 + + shift++ + if shift == bitsPerWord { + s = s[1:] + a = a[1:] + sw = s[0] + aw = a[0] + shift = 0 + } else { + sw >>= 1 + aw >>= 1 + } + } + + out[0] = byte(acc) + out = out[1:] + } +} + +func (p *poly) fromMod2(in *poly2) { + var shift uint + words := in[:] + word := words[0] + + for i := range p { + p[i] = uint16(word & 1) + word >>= 1 + shift++ + if shift == bitsPerWord { + words = words[1:] + word = words[0] + shift = 0 + } + } +} + +func (p *poly) fromMod3(in *poly3) { + var shift uint + s := in.s[:] + a := in.a[:] + sw := s[0] + aw := a[0] + + for i := range p { + p[i] = uint16(aw&1 | (sw&1)<<1) + aw >>= 1 + sw >>= 1 + shift++ + if shift == bitsPerWord { + a = a[1:] + s = s[1:] + aw = a[0] + sw = s[0] + shift = 0 + } + } +} + +func (p *poly) fromMod3ToModQ(in *poly3) { + var shift uint + s := in.s[:] + a := in.a[:] + sw := s[0] + aw := a[0] + + for i := range p { + p[i] = mod3ToModQ(uint16(aw&1 | (sw&1)<<1)) + aw >>= 1 + sw >>= 1 + shift++ + if shift == bitsPerWord { + a = a[1:] + s = s[1:] + aw = a[0] + sw = s[0] + shift = 0 + } + } +} + +func lsbToAll(v uint) uint { + return uint(int(v<<(bitsPerWord-1)) >> (bitsPerWord - 1)) +} + +func (p *poly3) mulConst(ms, ma uint) { + ms = lsbToAll(ms) + ma = lsbToAll(ma) + + for i := range p.a { + p.s[i], p.a[i] = (ma&p.s[i])^(ms&p.a[i]), (ma&p.a[i])^(ms&p.s[i]) + } +} + +func cmovWords(out, in *[wordsPerPoly]uint, mov uint) { + for i := range out { + out[i] = (out[i] & ^mov) | (in[i] & mov) + } +} + +func rotWords(out, in *[wordsPerPoly]uint, bits uint) { + start := bits / bitsPerWord + n := (N - bits) / bitsPerWord + + for i := uint(0); i < n; i++ { + out[i] = in[start+i] + } + + carry := in[wordsPerPoly-1] + + for i := uint(0); i < start; i++ { + out[n+i] = carry | in[i]<> (bitsPerWord - bitsInLastWord) + } + + out[wordsPerPoly-1] = carry +} + +// rotBits right-rotates the bits in |in|. bits must be a non-zero power of two +// and less than bitsPerWord. 
+func rotBits(out, in *[wordsPerPoly]uint, bits uint) { + if (bits == 0 || (bits & (bits - 1)) != 0 || bits > bitsPerWord/2 || bitsInLastWord < bitsPerWord/2) { + panic("internal error"); + } + + carry := in[wordsPerPoly-1] << (bitsPerWord - bits) + + for i := wordsPerPoly - 2; i >= 0; i-- { + out[i] = carry | in[i]>>bits + carry = in[i] << (bitsPerWord - bits) + } + + out[wordsPerPoly-1] = carry>>(bitsPerWord-bitsInLastWord) | in[wordsPerPoly-1]>>bits +} + +func (p *poly3) rotWords(bits uint, in *poly3) { + rotWords(&p.s, &in.s, bits) + rotWords(&p.a, &in.a, bits) +} + +func (p *poly3) rotBits(bits uint, in *poly3) { + rotBits(&p.s, &in.s, bits) + rotBits(&p.a, &in.a, bits) +} + +func (p *poly3) cmov(in *poly3, mov uint) { + cmovWords(&p.s, &in.s, mov) + cmovWords(&p.a, &in.a, mov) +} + +func (p *poly3) rot(bits uint) { + if bits > N { + panic("invalid") + } + var shifted poly3 + + shift := uint(9) + for ; (1 << shift) >= bitsPerWord; shift-- { + shifted.rotWords(1<>shift)) + } + for ; shift < 9; shift-- { + shifted.rotBits(1<>shift)) + } +} + +func (p *poly3) fmadd(ms, ma uint, in *poly3) { + ms = lsbToAll(ms) + ma = lsbToAll(ma) + + for i := range p.a { + products := (ma & in.s[i]) ^ (ms & in.a[i]) + producta := (ma & in.a[i]) ^ (ms & in.s[i]) + + ns1Ana1 := ^p.s[i] & ^p.a[i] + ns2Ana2 := ^products & ^producta + + p.s[i], p.a[i] = (p.a[i]&producta)^(ns1Ana1&products)^(p.s[i]&ns2Ana2), (p.s[i]&products)^(ns1Ana1&producta)^(p.a[i]&ns2Ana2) + } +} + +func (p *poly3) modPhiN() { + factora := uint(int(p.s[wordsPerPoly-1]<<(bitsPerWord-bitsInLastWord)) >> (bitsPerWord - 1)) + factors := uint(int(p.a[wordsPerPoly-1]<<(bitsPerWord-bitsInLastWord)) >> (bitsPerWord - 1)) + ns2Ana2 := ^factors & ^factora + + for i := range p.s { + ns1Ana1 := ^p.s[i] & ^p.a[i] + p.s[i], p.a[i] = (p.a[i]&factora)^(ns1Ana1&factors)^(p.s[i]&ns2Ana2), (p.s[i]&factors)^(ns1Ana1&factora)^(p.a[i]&ns2Ana2) + } +} + +func (p *poly3) cswap(other *poly3, swap uint) { + for i := range p.s { + sums := swap & (p.s[i] ^ other.s[i]) + p.s[i] ^= sums + other.s[i] ^= sums + + suma := swap & (p.a[i] ^ other.a[i]) + p.a[i] ^= suma + other.a[i] ^= suma + } +} + +func (p *poly3) mulx() { + carrys := (p.s[wordsPerPoly-1] >> (bitsInLastWord - 1)) & 1 + carrya := (p.a[wordsPerPoly-1] >> (bitsInLastWord - 1)) & 1 + + for i := range p.s { + outCarrys := p.s[i] >> (bitsPerWord - 1) + outCarrya := p.a[i] >> (bitsPerWord - 1) + p.s[i] <<= 1 + p.a[i] <<= 1 + p.s[i] |= carrys + p.a[i] |= carrya + carrys = outCarrys + carrya = outCarrya + } +} + +func (p *poly3) divx() { + var carrys, carrya uint + + for i := len(p.s) - 1; i >= 0; i-- { + outCarrys := p.s[i] & 1 + outCarrya := p.a[i] & 1 + p.s[i] >>= 1 + p.a[i] >>= 1 + p.s[i] |= carrys << (bitsPerWord - 1) + p.a[i] |= carrya << (bitsPerWord - 1) + carrys = outCarrys + carrya = outCarrya + } +} + +type poly2 [wordsPerPoly]uint + +func (p *poly2) fromDiscrete(in *poly) { + var shift uint + words := p[:] + words[0] = 0 + + for _, v := range in { + words[0] >>= 1 + words[0] |= uint(v&1) << (bitsPerWord - 1) + shift++ + if shift == bitsPerWord { + words = words[1:] + words[0] = 0 + shift = 0 + } + } + + words[0] >>= bitsPerWord - shift +} + +func (p *poly2) setPhiN() { + for i := range p { + p[i] = ^uint(0) + } + p[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1 +} + +func (p *poly2) cswap(other *poly2, swap uint) { + for i := range p { + sum := swap & (p[i] ^ other[i]) + p[i] ^= sum + other[i] ^= sum + } +} + +func (p *poly2) fmadd(m uint, in *poly2) { + m = ^(m - 1) + + for i := range p { + p[i] ^= 
in[i] & m + } +} + +func (p *poly2) lshift1() { + var carry uint + for i := range p { + nextCarry := p[i] >> (bitsPerWord - 1) + p[i] <<= 1 + p[i] |= carry + carry = nextCarry + } +} + +func (p *poly2) rshift1() { + var carry uint + for i := len(p) - 1; i >= 0; i-- { + nextCarry := p[i] & 1 + p[i] >>= 1 + p[i] |= carry << (bitsPerWord - 1) + carry = nextCarry + } +} + +func (p *poly2) rot(bits uint) { + if bits > N { + panic("invalid") + } + var shifted [wordsPerPoly]uint + out := (*[wordsPerPoly]uint)(p) + + shift := uint(9) + for ; (1 << shift) >= bitsPerWord; shift-- { + rotWords(&shifted, out, 1<>shift)) + } + for ; shift < 9; shift-- { + rotBits(&shifted, out, 1<>shift)) + } +} + +type poly [N]uint16 + +func (in *poly) marshal(out []byte) { + p := in[:] + + for len(p) >= 8 { + out[0] = byte(p[0]) + out[1] = byte(p[0]>>8) | byte((p[1]&0x07)<<5) + out[2] = byte(p[1] >> 3) + out[3] = byte(p[1]>>11) | byte((p[2]&0x3f)<<2) + out[4] = byte(p[2]>>6) | byte((p[3]&0x01)<<7) + out[5] = byte(p[3] >> 1) + out[6] = byte(p[3]>>9) | byte((p[4]&0x0f)<<4) + out[7] = byte(p[4] >> 4) + out[8] = byte(p[4]>>12) | byte((p[5]&0x7f)<<1) + out[9] = byte(p[5]>>7) | byte((p[6]&0x03)<<6) + out[10] = byte(p[6] >> 2) + out[11] = byte(p[6]>>10) | byte((p[7]&0x1f)<<3) + out[12] = byte(p[7] >> 5) + + p = p[8:] + out = out[13:] + } + + // There are four remaining values. + out[0] = byte(p[0]) + out[1] = byte(p[0]>>8) | byte((p[1]&0x07)<<5) + out[2] = byte(p[1] >> 3) + out[3] = byte(p[1]>>11) | byte((p[2]&0x3f)<<2) + out[4] = byte(p[2]>>6) | byte((p[3]&0x01)<<7) + out[5] = byte(p[3] >> 1) + out[6] = byte(p[3] >> 9) +} + +func (out *poly) unmarshal(in []byte) bool { + p := out[:] + for i := 0; i < 87; i++ { + p[0] = uint16(in[0]) | uint16(in[1]&0x1f)<<8 + p[1] = uint16(in[1]>>5) | uint16(in[2])<<3 | uint16(in[3]&3)<<11 + p[2] = uint16(in[3]>>2) | uint16(in[4]&0x7f)<<6 + p[3] = uint16(in[4]>>7) | uint16(in[5])<<1 | uint16(in[6]&0xf)<<9 + p[4] = uint16(in[6]>>4) | uint16(in[7])<<4 | uint16(in[8]&1)<<12 + p[5] = uint16(in[8]>>1) | uint16(in[9]&0x3f)<<7 + p[6] = uint16(in[9]>>6) | uint16(in[10])<<2 | uint16(in[11]&7)<<10 + p[7] = uint16(in[11]>>3) | uint16(in[12])<<5 + + p = p[8:] + in = in[13:] + } + + // There are four coefficients left over + p[0] = uint16(in[0]) | uint16(in[1]&0x1f)<<8 + p[1] = uint16(in[1]>>5) | uint16(in[2])<<3 | uint16(in[3]&3)<<11 + p[2] = uint16(in[3]>>2) | uint16(in[4]&0x7f)<<6 + p[3] = uint16(in[4]>>7) | uint16(in[5])<<1 | uint16(in[6]&0xf)<<9 + + if in[6]&0xf0 != 0 { + return false + } + + out[N-1] = 0 + var top int + for _, v := range out { + top += int(v) + } + + out[N-1] = uint16(-top) % Q + return true +} + +func (in *poly) marshalS3(out []byte) { + p := in[:] + for len(p) >= 5 { + out[0] = byte(p[0] + p[1]*3 + p[2]*9 + p[3]*27 + p[4]*81) + out = out[1:] + p = p[5:] + } +} + +func (out *poly) unmarshalS3(in []byte) bool { + p := out[:] + for i := 0; i < 140; i++ { + c := in[0] + if c >= 243 { + return false + } + p[0] = uint16(c % 3) + p[1] = uint16((c / 3) % 3) + p[2] = uint16((c / 9) % 3) + p[3] = uint16((c / 27) % 3) + p[4] = uint16((c / 81) % 3) + + p = p[5:] + in = in[1:] + } + + out[N-1] = 0 + return true +} + +func (p *poly) modPhiN() { + for i := range p { + p[i] = (p[i] + Q - p[N-1]) % Q + } +} + +func (out *poly) shortSample(in []byte) { + // b a result + // 00 00 00 + // 00 01 01 + // 00 10 10 + // 00 11 11 + // 01 00 10 + // 01 01 00 + // 01 10 01 + // 01 11 11 + // 10 00 01 + // 10 01 10 + // 10 10 00 + // 10 11 11 + // 11 00 11 + // 11 01 11 + // 11 10 11 + // 11 11 11 + + // 
1111 1111 1100 1001 1101 0010 1110 0100 + // f f c 9 d 2 e 4 + const lookup = uint32(0xffc9d2e4) + + p := out[:] + for i := 0; i < 87; i++ { + v := binary.LittleEndian.Uint32(in) + v2 := (v & 0x55555555) + ((v >> 1) & 0x55555555) + for j := 0; j < 8; j++ { + p[j] = uint16(lookup >> ((v2 & 15) << 1) & 3) + v2 >>= 4 + } + p = p[8:] + in = in[4:] + } + + // There are four values remaining. + v := binary.LittleEndian.Uint32(in) + v2 := (v & 0x55555555) + ((v >> 1) & 0x55555555) + for j := 0; j < 4; j++ { + p[j] = uint16(lookup >> ((v2 & 15) << 1) & 3) + v2 >>= 4 + } + + out[N-1] = 0 +} + +func (out *poly) shortSamplePlus(in []byte) { + out.shortSample(in) + + var sum uint16 + for i := 0; i < N-1; i++ { + sum += mod3ResultToModQ(out[i] * out[i+1]) + } + + scale := 1 + (1 & (sum >> 12)) + for i := 0; i < len(out); i += 2 { + out[i] = (out[i] * scale) % 3 + } +} + +func mul(out, scratch, a, b []uint16) { + const schoolbookLimit = 32 + if len(a) < schoolbookLimit { + for i := 0; i < len(a)*2; i++ { + out[i] = 0 + } + for i := range a { + for j := range b { + out[i+j] += a[i] * b[j] + } + } + return + } + + lowLen := len(a) / 2 + highLen := len(a) - lowLen + aLow, aHigh := a[:lowLen], a[lowLen:] + bLow, bHigh := b[:lowLen], b[lowLen:] + + for i := 0; i < lowLen; i++ { + out[i] = aHigh[i] + aLow[i] + } + if highLen != lowLen { + out[lowLen] = aHigh[lowLen] + } + + for i := 0; i < lowLen; i++ { + out[highLen+i] = bHigh[i] + bLow[i] + } + if highLen != lowLen { + out[highLen+lowLen] = bHigh[lowLen] + } + + mul(scratch, scratch[2*highLen:], out[:highLen], out[highLen:highLen*2]) + mul(out[lowLen*2:], scratch[2*highLen:], aHigh, bHigh) + mul(out, scratch[2*highLen:], aLow, bLow) + + for i := 0; i < lowLen*2; i++ { + scratch[i] -= out[i] + out[lowLen*2+i] + } + if lowLen != highLen { + scratch[lowLen*2] -= out[lowLen*4] + } + + for i := 0; i < 2*highLen; i++ { + out[lowLen+i] += scratch[i] + } +} + +func (out *poly) mul(a, b *poly) { + var prod, scratch [2 * N]uint16 + mul(prod[:], scratch[:], a[:], b[:]) + for i := range out { + out[i] = (prod[i] + prod[i+N]) % Q + } +} + +func (p3 *poly3) mulMod3(x, y *poly3) { + // (𝑥^n - 1) is a multiple of Φ(N) so we can work mod (𝑥^n - 1) here and + // (reduce mod Φ(N) afterwards. + x3 := *x + y3 := *y + s := x3.s[:] + a := x3.a[:] + sw := s[0] + aw := a[0] + p3.zero() + var shift uint + for i := 0; i < N; i++ { + p3.fmadd(sw, aw, &y3) + sw >>= 1 + aw >>= 1 + shift++ + if shift == bitsPerWord { + s = s[1:] + a = a[1:] + sw = s[0] + aw = a[0] + shift = 0 + } + y3.mulx() + } + p3.modPhiN() +} + +// mod3ToModQ maps {0, 1, 2, 3} to {0, 1, Q-1, 0xffff} +// The case of n == 3 should never happen but is included so that modQToMod3 +// can easily catch invalid inputs. +func mod3ToModQ(n uint16) uint16 { + return uint16(uint64(0xffff1fff00010000) >> (16 * n)) +} + +// modQToMod3 maps {0, 1, Q-1} to {(0, 0), (0, 1), (1, 0)} and also returns an int +// which is one if the input is in range and zero otherwise. 
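+//
+// To make the packing concrete: mod3ToModQ's constant 0xffff1fff00010000
+// holds the four possible 16-bit outputs {0, 1, Q-1, 0xffff}, so
+// mod3ToModQ(2) = 0x1fff = Q-1. In the other direction, modQToMod3(Q-1)
+// computes (0x1fff&3) - ((0x1fff>>1)&1) = 3 - 1 = 2 (the (s, a) = (1, 0)
+// encoding from the table at the top of the file), and the round trip back
+// through mod3ToModQ confirms, in constant time, that the input really was
+// one of {0, 1, Q-1}. An invalid input such as 2 maps to 1, but
+// mod3ToModQ(1) = 1 != 2, so the returned ok value is zero.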
+func modQToMod3(n uint16) (uint16, int) { + result := (n&3 - (n>>1)&1) + return result, subtle.ConstantTimeEq(int32(mod3ToModQ(result)), int32(n)) +} + +// mod3ResultToModQ maps {0, 1, 2, 4} to {0, 1, Q-1, 1} +func mod3ResultToModQ(n uint16) uint16 { + return ((((uint16(0x13) >> n) & 1) - 1) & 0x1fff) | ((uint16(0x12) >> n) & 1) + //shift := (uint(0x324) >> (2 * n)) & 3 + //return uint16(uint64(0x00011fff00010000) >> (16 * shift)) +} + +// mulXMinus1 sets out to a×(𝑥 - 1) mod (𝑥^n - 1) +func (out *poly) mulXMinus1() { + // Multiplying by (𝑥 - 1) means negating each coefficient and adding in + // the value of the previous one. + origOut700 := out[700] + + for i := N - 1; i > 0; i-- { + out[i] = (Q - out[i] + out[i-1]) % Q + } + out[0] = (Q - out[0] + origOut700) % Q +} + +func (out *poly) lift(a *poly) { + // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the + // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime). + + // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up: + // + // R. = PolynomialRing(GF(3)…) + // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n)) + // list(inv)[:15] + // [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2] + // + // This three-element pattern of coefficients repeats for the whole + // polynomial. + // + // Next define the overbar operator such that z̅ = z[0] + + // reverse(z[1:]). (Index zero of a polynomial here is the coefficient + // of the constant term. So index one is the coefficient of 𝑥 and so + // on.) + // + // A less odd way to define this is to see that z̅ negates the indexes, + // so z̅[0] = z[-0], z̅[1] = z[-1] and so on. + // + // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = , vz[1] = , …. (Where is the inner product: the sum + // of the point-wise products.) Although we calculated the inverse mod + // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end. + // (That's because (𝑥^N - 1) is a multiple of Φ(N).) + // + // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation + // of the list of coefficients. + // + // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like: + // + // def reverse(xs): + // suffix = list(xs[1:]) + // suffix.reverse() + // return [xs[0]] + suffix + // + // def rotate(xs): + // return [xs[-1]] + xs[:-1] + // + // zoverbar = reverse(list(inv) + [0]) + // xzoverbar = rotate(reverse(list(inv) + [0])) + // x2zoverbar = rotate(rotate(reverse(list(inv) + [0]))) + // + // zoverbar[:15] + // [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1] + // xzoverbar[:15] + // [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0] + // x2zoverbar[:15] + // [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2] + // + // (For a formula for z̅, see lemma two of appendix B.) + // + // After the first three elements have been taken care of, all then have + // a repeating three-element cycle. The next value (𝑥^3z̅) involves + // three rotations of the first pattern, thus the three-element cycle + // lines up. However, the discontinuity in the first three elements + // obviously moves to a different position. Consider the difference + // between 𝑥^3z̅ and z̅: + // + // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15] + // [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + // + // This pattern of differences is the same for all elements, although it + // obviously moves right with the rotations. + // + // From this, we reach algorithm eight of appendix B. + + // Handle the first three elements of the inner products. 
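+ // Following the patterns listed above, z̅ begins [1, 0, 1, ...], 𝑥z̅
+ // begins [0, 1, 0, ...] and 𝑥^2z̅ begins [2, 0, 1, ...], so the first
+ // terms of the three inner products are a[0]+a[2], a[1] and
+ // 2·a[0]+a[2]; the repeating three-element cycle of each pattern is then
+ // added in by the loop below.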
+ out[0] = a[0] + a[2] + out[1] = a[1] + out[2] = 2*a[0] + a[2] + + // Use the repeating pattern to complete the first three inner products. + for i := 3; i < 699; i += 3 { + out[0] += 2*a[i] + a[i+2] + out[1] += a[i] + 2*a[i+1] + out[2] += a[i+1] + 2*a[i+2] + } + + // Handle the fact that the three-element pattern doesn't fill the + // polynomial exactly (since 701 isn't a multiple of three). + out[2] += a[700] + out[0] += 2 * a[699] + out[1] += a[699] + 2*a[700] + + out[0] = out[0] % 3 + out[1] = out[1] % 3 + out[2] = out[2] % 3 + + // Calculate the remaining inner products by taking advantage of the + // fact that the pattern repeats every three cycles and the pattern of + // differences is moves with the rotation. + for i := 3; i < N; i++ { + // Add twice something is the same as subtracting when working + // mod 3. Doing it this way avoids underflow. Underflow is bad + // because "% 3" doesn't work correctly for negative numbers + // here since underflow will wrap to 2^16-1 and 2^16 isn't a + // multiple of three. + out[i] = (out[i-3] + 2*(a[i-2]+a[i-1]+a[i])) % 3 + } + + // Reduce mod Φ(N) by subtracting a multiple of out[700] from every + // element and convert to mod Q. (See above about adding twice as + // subtraction.) + v := out[700] * 2 + for i := range out { + out[i] = mod3ToModQ((out[i] + v) % 3) + } + + out.mulXMinus1() +} + +func (a *poly) cswap(b *poly, swap uint16) { + for i := range a { + sum := swap & (a[i] ^ b[i]) + a[i] ^= sum + b[i] ^= sum + } +} + +func lt(a, b uint) uint { + if a < b { + return ^uint(0) + } + return 0 +} + +func bsMul(s1, a1, s2, a2 uint) (s3, a3 uint) { + s3 = (a1 & s2) ^ (s1 & a2) + a3 = (a1 & a2) ^ (s1 & s2) + return +} + +func (out *poly3) invertMod3(in *poly3) { + // This algorithm follows algorithm 10 in the paper. (Although note that + // the paper appears to have a bug: k should start at zero, not one.) + // The best explanation for why it works is in the "Why it works" + // section of + // https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf. + var k uint + degF, degG := uint(N-1), uint(N-1) + + var b, c, g poly3 + f := *in + + for i := range g.a { + g.a[i] = ^uint(0) + } + + b.a[0] = 1 + + var f0s, f0a uint + stillGoing := ^uint(0) + for i := 0; i < 2*(N-1)-1; i++ { + ss, sa := bsMul(f.s[0], f.a[0], g.s[0], g.a[0]) + ss, sa = sa&stillGoing&1, ss&stillGoing&1 + shouldSwap := ^uint(int((ss|sa)-1)>>(bitsPerWord-1)) & lt(degF, degG) + f.cswap(&g, shouldSwap) + b.cswap(&c, shouldSwap) + degF, degG = (degG&shouldSwap)|(degF & ^shouldSwap), (degF&shouldSwap)|(degG&^shouldSwap) + f.fmadd(ss, sa, &g) + b.fmadd(ss, sa, &c) + + f.divx() + f.s[wordsPerPoly-1] &= ((1 << bitsInLastWord) - 1) >> 1 + f.a[wordsPerPoly-1] &= ((1 << bitsInLastWord) - 1) >> 1 + c.mulx() + c.s[0] &= ^uint(1) + c.a[0] &= ^uint(1) + + degF-- + k += 1 & stillGoing + f0s = (stillGoing & f.s[0]) | (^stillGoing & f0s) + f0a = (stillGoing & f.a[0]) | (^stillGoing & f0a) + stillGoing = ^uint(int(degF-1) >> (bitsPerWord - 1)) + } + + k -= N & lt(N, k) + *out = b + out.rot(k) + out.mulConst(f0s, f0a) + out.modPhiN() +} + +func (out *poly) invertMod2(a *poly) { + // This algorithm follows mix of algorithm 10 in the paper and the first + // page of the PDF linked below. (Although note that the paper appears + // to have a bug: k should start at zero, not one.) The best explanation + // for why it works is in the "Why it works" section of + // https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf. 
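+ // As in invertMod3 above: f and g hold the working polynomials, b and c
+ // accumulate the corresponding candidate inverses, and degF/degG track
+ // bounds on their degrees. The loop runs for a fixed number of
+ // iterations and uses masked cswap/fmadd operations instead of branches,
+ // with the final rot(k) compensating for the shifts accumulated along
+ // the way, so the whole computation is constant-time.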
+ var k uint + degF, degG := uint(N-1), uint(N-1) + + var f poly2 + f.fromDiscrete(a) + var b, c, g poly2 + g.setPhiN() + b[0] = 1 + + stillGoing := ^uint(0) + for i := 0; i < 2*(N-1)-1; i++ { + s := uint(f[0]&1) & stillGoing + shouldSwap := ^(s - 1) & lt(degF, degG) + f.cswap(&g, shouldSwap) + b.cswap(&c, shouldSwap) + degF, degG = (degG&shouldSwap)|(degF & ^shouldSwap), (degF&shouldSwap)|(degG&^shouldSwap) + f.fmadd(s, &g) + b.fmadd(s, &c) + + f.rshift1() + c.lshift1() + + degF-- + k += 1 & stillGoing + stillGoing = ^uint(int(degF-1) >> (bitsPerWord - 1)) + } + + k -= N & lt(N, k) + b.rot(k) + out.fromMod2(&b) +} + +func (out *poly) invert(origA *poly) { + // Inversion mod Q, which is done based on the result of inverting mod + // 2. See the NTRU paper, page three. + var a, tmp, tmp2, b poly + b.invertMod2(origA) + + // Negate a. + for i := range a { + a[i] = Q - origA[i] + } + + // We are working mod Q=2**13 and we need to iterate ceil(log_2(13)) + // times, which is four. + for i := 0; i < 4; i++ { + tmp.mul(&a, &b) + tmp[0] += 2 + tmp2.mul(&b, &tmp) + b = tmp2 + } + + *out = b +} + +type PublicKey struct { + h poly +} + +func ParsePublicKey(in []byte) (*PublicKey, bool) { + ret := new(PublicKey) + if !ret.h.unmarshal(in) { + return nil, false + } + return ret, true +} + +func (pub *PublicKey) Marshal() []byte { + ret := make([]byte, modQBytes) + pub.h.marshal(ret) + return ret +} + +func (pub *PublicKey) Encap(rand io.Reader) (ciphertext []byte, sharedKey []byte) { + var randBytes [352 + 352]byte + if _, err := io.ReadFull(rand, randBytes[:]); err != nil { + panic("rand failed") + } + + var m, r poly + m.shortSample(randBytes[:352]) + r.shortSample(randBytes[352:]) + + var mBytes, rBytes [mod3Bytes]byte + m.marshalS3(mBytes[:]) + r.marshalS3(rBytes[:]) + + h := sha256.New() + h.Write([]byte("confirmation hash\x00")) + h.Write(mBytes[:]) + h.Write(rBytes[:]) + confirmationDigest := h.Sum(nil) + + encrypted := pub.owf(&m, &r) + ciphertext = make([]byte, 0, len(encrypted)+len(confirmationDigest)) + ciphertext = append(ciphertext, encrypted...) + ciphertext = append(ciphertext, confirmationDigest...) + + h.Reset() + h.Write([]byte("shared key\x00")) + h.Write(mBytes[:]) + h.Write(rBytes[:]) + h.Write(ciphertext) + sharedKey = h.Sum(nil) + + return ciphertext, sharedKey +} + +func (pub *PublicKey) owf(m, r *poly) []byte { + for i := range r { + r[i] = mod3ToModQ(r[i]) + } + + var mq poly + mq.lift(m) + + var e poly + e.mul(r, &pub.h) + for i := range e { + e[i] = (e[i] + mq[i]) % Q + } + + ret := make([]byte, modQBytes) + e.marshal(ret[:]) + return ret +} + +type PrivateKey struct { + PublicKey + f, fp poly3 + hInv poly + hmacKey [32]byte +} + +func (priv *PrivateKey) Marshal() []byte { + var ret [2*mod3Bytes + modQBytes]byte + priv.f.marshal(ret[:]) + priv.fp.marshal(ret[mod3Bytes:]) + priv.h.marshal(ret[2*mod3Bytes:]) + return ret[:] +} + +func (priv *PrivateKey) Decap(ciphertext []byte) (sharedKey []byte, ok bool) { + if len(ciphertext) != modQBytes+32 { + return nil, false + } + + var e poly + if !e.unmarshal(ciphertext[:modQBytes]) { + return nil, false + } + + var f poly + f.fromMod3ToModQ(&priv.f) + + var v1, m poly + v1.mul(&e, &f) + + var v13 poly3 + v13.fromDiscreteMod3(&v1) + // Note: v13 is not reduced mod phi(n). 
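+ // That is fine here: mulMod3 works mod (𝑥^n - 1), which is a multiple
+ // of Φ(N), and the result is reduced with modPhiN afterwards.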
+ + var m3 poly3 + m3.mulMod3(&v13, &priv.fp) + m3.modPhiN() + m.fromMod3(&m3) + + var mLift, delta poly + mLift.lift(&m) + for i := range delta { + delta[i] = (e[i] - mLift[i] + Q) % Q + } + delta.mul(&delta, &priv.hInv) + delta.modPhiN() + + var r poly3 + allOk := r.fromModQ(&delta) + + var mBytes, rBytes [mod3Bytes]byte + m.marshalS3(mBytes[:]) + r.marshal(rBytes[:]) + + h := sha256.New() + h.Write([]byte("confirmation hash\x00")) + h.Write(mBytes[:]) + h.Write(rBytes[:]) + confirmationDigest := h.Sum(nil) + + var rPoly poly + rPoly.fromMod3(&r) + encrypted := priv.PublicKey.owf(&m, &rPoly) + expectedCiphertext := make([]byte, 0, len(encrypted)+len(confirmationDigest)) + expectedCiphertext = append(expectedCiphertext, encrypted...) + expectedCiphertext = append(expectedCiphertext, confirmationDigest...) + + allOk &= subtle.ConstantTimeCompare(ciphertext, expectedCiphertext) + + hmacHash := hmac.New(sha256.New, priv.hmacKey[:]) + hmacHash.Write(ciphertext) + hmacDigest := hmacHash.Sum(nil) + + h.Reset() + h.Write([]byte("shared key\x00")) + h.Write(mBytes[:]) + h.Write(rBytes[:]) + h.Write(ciphertext) + sharedKey = h.Sum(nil) + + mask := uint8(allOk - 1) + for i := range sharedKey { + sharedKey[i] = (sharedKey[i] & ^mask) | (hmacDigest[i] & mask) + } + + return sharedKey, true +} + +func GenerateKey(rand io.Reader) PrivateKey { + var randBytes [352 + 352]byte + if _, err := io.ReadFull(rand, randBytes[:]); err != nil { + panic("rand failed") + } + + var f poly + f.shortSamplePlus(randBytes[:352]) + var priv PrivateKey + priv.f.fromDiscrete(&f) + priv.fp.invertMod3(&priv.f) + + var g poly + g.shortSamplePlus(randBytes[352:]) + + var pgPhi1 poly + for i := range g { + pgPhi1[i] = mod3ToModQ(g[i]) + } + for i := range pgPhi1 { + pgPhi1[i] = (pgPhi1[i] * 3) % Q + } + pgPhi1.mulXMinus1() + + var fModQ poly + fModQ.fromMod3ToModQ(&priv.f) + + var pfgPhi1 poly + pfgPhi1.mul(&fModQ, &pgPhi1) + + var i poly + i.invert(&pfgPhi1) + + priv.h.mul(&i, &pgPhi1) + priv.h.mul(&priv.h, &pgPhi1) + + priv.hInv.mul(&i, &fModQ) + priv.hInv.mul(&priv.hInv, &fModQ) + + return priv +} diff --git a/ssl/test/runner/key_agreement.go b/ssl/test/runner/key_agreement.go index 791325cd..f40552d9 100644 --- a/ssl/test/runner/key_agreement.go +++ b/ssl/test/runner/key_agreement.go @@ -17,6 +17,7 @@ import ( "boringssl.googlesource.com/boringssl/ssl/test/runner/curve25519" "boringssl.googlesource.com/boringssl/ssl/test/runner/ed25519" + "boringssl.googlesource.com/boringssl/ssl/test/runner/hrss" ) type keyType int @@ -37,7 +38,7 @@ type rsaKeyAgreement struct { exportKey *rsa.PrivateKey } -func (ka *rsaKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) { +func (ka *rsaKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) { // Save the client version for comparison later. ka.clientVersion = clientHello.vers @@ -347,6 +348,90 @@ func (e *x25519ECDHCurve) finish(peerKey []byte) (preMasterSecret []byte, err er return out[:], nil } +// cecpq2Curve implements CECPQ2, which is HRSS+SXY combined with X25519. 
+type cecpq2Curve struct { + x25519PrivateKey [32]byte + hrssPrivateKey hrss.PrivateKey +} + +func (e *cecpq2Curve) offer(rand io.Reader) (publicKey []byte, err error) { + if _, err := io.ReadFull(rand, e.x25519PrivateKey[:]); err != nil { + return nil, err + } + + var x25519Public [32]byte + curve25519.ScalarBaseMult(&x25519Public, &e.x25519PrivateKey) + + e.hrssPrivateKey = hrss.GenerateKey(rand) + hrssPublic := e.hrssPrivateKey.PublicKey.Marshal() + + var ret []byte + ret = append(ret, x25519Public[:]...) + ret = append(ret, hrssPublic...) + return ret, nil +} + +func (e *cecpq2Curve) accept(rand io.Reader, peerKey []byte) (publicKey []byte, preMasterSecret []byte, err error) { + if len(peerKey) != 32+hrss.PublicKeySize { + return nil, nil, errors.New("tls: bad length CECPQ2 offer") + } + + if _, err := io.ReadFull(rand, e.x25519PrivateKey[:]); err != nil { + return nil, nil, err + } + + var x25519Shared, x25519PeerKey, x25519Public [32]byte + copy(x25519PeerKey[:], peerKey) + curve25519.ScalarBaseMult(&x25519Public, &e.x25519PrivateKey) + curve25519.ScalarMult(&x25519Shared, &e.x25519PrivateKey, &x25519PeerKey) + + // Per RFC 7748, reject the all-zero value in constant time. + var zeros [32]byte + if subtle.ConstantTimeCompare(zeros[:], x25519Shared[:]) == 1 { + return nil, nil, errors.New("tls: X25519 value with wrong order") + } + + hrssPublicKey, ok := hrss.ParsePublicKey(peerKey[32:]) + if !ok { + return nil, nil, errors.New("tls: bad CECPQ2 offer") + } + + hrssCiphertext, hrssShared := hrssPublicKey.Encap(rand) + + publicKey = append(publicKey, x25519Public[:]...) + publicKey = append(publicKey, hrssCiphertext...) + preMasterSecret = append(preMasterSecret, x25519Shared[:]...) + preMasterSecret = append(preMasterSecret, hrssShared...) + + return publicKey, preMasterSecret, nil +} + +func (e *cecpq2Curve) finish(peerKey []byte) (preMasterSecret []byte, err error) { + if len(peerKey) != 32+hrss.CiphertextSize { + return nil, errors.New("tls: bad length CECPQ2 reply") + } + + var x25519Shared, x25519PeerKey [32]byte + copy(x25519PeerKey[:], peerKey) + curve25519.ScalarMult(&x25519Shared, &e.x25519PrivateKey, &x25519PeerKey) + + // Per RFC 7748, reject the all-zero value in constant time. + var zeros [32]byte + if subtle.ConstantTimeCompare(zeros[:], x25519Shared[:]) == 1 { + return nil, errors.New("tls: X25519 value with wrong order") + } + + hrssShared, ok := e.hrssPrivateKey.Decap(peerKey[32:]) + if !ok { + return nil, errors.New("tls: invalid HRSS ciphertext") + } + + preMasterSecret = append(preMasterSecret, x25519Shared[:]...) + preMasterSecret = append(preMasterSecret, hrssShared...) 
+ + return preMasterSecret, nil +} + func curveForCurveID(id CurveID, config *Config) (ecdhCurve, bool) { switch id { case CurveP224: @@ -359,6 +444,8 @@ func curveForCurveID(id CurveID, config *Config) (ecdhCurve, bool) { return &ellipticECDHCurve{curve: elliptic.P521(), sendCompressed: config.Bugs.SendCompressedCoordinates}, true case CurveX25519: return &x25519ECDHCurve{setHighBit: config.Bugs.SetX25519HighBit}, true + case CurveCECPQ2: + return &cecpq2Curve{}, true default: return nil, false } @@ -501,12 +588,17 @@ type ecdheKeyAgreement struct { peerKey []byte } -func (ka *ecdheKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) { +func (ka *ecdheKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) { var curveid CurveID preferredCurves := config.curvePreferences() NextCandidate: for _, candidate := range preferredCurves { + if candidate == CurveCECPQ2 && version < VersionTLS13 { + // CECPQ2 is TLS 1.3-only. + continue + } + for _, c := range clientHello.supportedCurves { if candidate == c { curveid = c @@ -614,7 +706,7 @@ func (ka *ecdheKeyAgreement) peerSignatureAlgorithm() signatureAlgorithm { // exchange. type nilKeyAgreement struct{} -func (ka *nilKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) { +func (ka *nilKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) { return nil, nil } @@ -666,7 +758,7 @@ type pskKeyAgreement struct { identityHint string } -func (ka *pskKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) { +func (ka *pskKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) { // Assemble the identity hint. bytes := make([]byte, 2+len(config.PreSharedKeyIdentity)) bytes[0] = byte(len(config.PreSharedKeyIdentity) >> 8) @@ -675,7 +767,7 @@ func (ka *pskKeyAgreement) generateServerKeyExchange(config *Config, cert *Certi // If there is one, append the base key agreement's // ServerKeyExchange. - baseSkx, err := ka.base.generateServerKeyExchange(config, cert, clientHello, hello) + baseSkx, err := ka.base.generateServerKeyExchange(config, cert, clientHello, hello, version) if err != nil { return nil, err } diff --git a/ssl/test/runner/runner.go b/ssl/test/runner/runner.go index da81f236..cbce0651 100644 --- a/ssl/test/runner/runner.go +++ b/ssl/test/runner/runner.go @@ -9619,7 +9619,7 @@ func addSignatureAlgorithmTests() { CipherSuites: []uint16{TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256}, Certificates: []Certificate{ecdsaP256Certificate}, }, - flags: []string{"-p384-only"}, + flags: []string{"-curves", strconv.Itoa(int(CurveP384))}, shouldFail: true, expectedError: ":BAD_ECC_CERT:", }) @@ -9631,7 +9631,7 @@ func addSignatureAlgorithmTests() { MaxVersion: VersionTLS13, Certificates: []Certificate{ecdsaP256Certificate}, }, - flags: []string{"-p384-only"}, + flags: []string{"-curves", strconv.Itoa(int(CurveP384))}, }) // In TLS 1.2, the ECDSA curve is not in the signature algorithm. 
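
The cecpq2Curve methods above glue the two primitives together by plain concatenation: the client's key share is its X25519 public value followed by the HRSS public key, the server's reply is its X25519 public value followed by an HRSS ciphertext, and the premaster secret is the X25519 shared secret followed by the HRSS shared key. A hypothetical round trip over those three methods (it would have to live in package runner, since cecpq2Curve is unexported, and assumes bytes, errors and crypto/rand are imported):

    func cecpq2RoundTrip() error {
        client, server := &cecpq2Curve{}, &cecpq2Curve{}

        // Client -> server: 32-byte X25519 public value || HRSS public key.
        offer, err := client.offer(rand.Reader)
        if err != nil {
            return err
        }

        // Server -> client: 32-byte X25519 public value || HRSS ciphertext,
        // plus the server's copy of the premaster secret.
        reply, serverSecret, err := server.accept(rand.Reader, offer)
        if err != nil {
            return err
        }

        // Client completes: X25519 shared secret || HRSS shared key.
        clientSecret, err := client.finish(reply)
        if err != nil {
            return err
        }

        if !bytes.Equal(clientSecret, serverSecret) {
            return errors.New("CECPQ2 secrets diverged")
        }
        return nil
    }
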
@@ -10711,6 +10711,7 @@ var testCurves = []struct { {"P-384", CurveP384}, {"P-521", CurveP521}, {"X25519", CurveX25519}, + {"CECPQ2", CurveCECPQ2}, } const bogusCurve = 0x1234 @@ -10718,6 +10719,10 @@ const bogusCurve = 0x1234 func addCurveTests() { for _, curve := range testCurves { for _, ver := range tlsVersions { + if curve.id == CurveCECPQ2 && ver.version < VersionTLS13 { + continue + } + suffix := curve.name + "-" + ver.name testCases = append(testCases, testCase{ @@ -10758,7 +10763,7 @@ func addCurveTests() { expectedCurveID: curve.id, }) - if curve.id != CurveX25519 { + if curve.id != CurveX25519 && curve.id != CurveCECPQ2 { testCases = append(testCases, testCase{ name: "CurveTest-Client-Compressed-" + suffix, config: Config{ @@ -10902,7 +10907,7 @@ func addCurveTests() { IgnorePeerCurvePreferences: true, }, }, - flags: []string{"-p384-only"}, + flags: []string{"-curves", strconv.Itoa(int(CurveP384))}, shouldFail: true, expectedError: ":WRONG_CURVE:", }) @@ -10918,7 +10923,7 @@ func addCurveTests() { SendCurve: CurveP256, }, }, - flags: []string{"-p384-only"}, + flags: []string{"-curves", strconv.Itoa(int(CurveP384))}, shouldFail: true, expectedError: ":WRONG_CURVE:", }) @@ -11169,6 +11174,112 @@ func addCurveTests() { }, }, }) + + // CECPQ2 should not be offered by a TLS < 1.3 client. + testCases = append(testCases, testCase{ + name: "CECPQ2NotInTLS12", + config: Config{ + Bugs: ProtocolBugs{ + FailIfCECPQ2Offered: true, + }, + }, + flags: []string{ + "-max-version", strconv.Itoa(VersionTLS12), + "-curves", strconv.Itoa(int(CurveCECPQ2)), + "-curves", strconv.Itoa(int(CurveX25519)), + }, + }) + + // CECPQ2 should not crash a TLS < 1.3 client if the server mistakenly + // selects it. + testCases = append(testCases, testCase{ + name: "CECPQ2NotAcceptedByTLS12Client", + config: Config{ + Bugs: ProtocolBugs{ + SendCurve: CurveCECPQ2, + }, + }, + flags: []string{ + "-max-version", strconv.Itoa(VersionTLS12), + "-curves", strconv.Itoa(int(CurveCECPQ2)), + "-curves", strconv.Itoa(int(CurveX25519)), + }, + shouldFail: true, + expectedError: ":WRONG_CURVE:", + }) + + // CECPQ2 should not be offered by default as a client. + testCases = append(testCases, testCase{ + name: "CECPQ2NotEnabledByDefaultInClients", + config: Config{ + MinVersion: VersionTLS13, + Bugs: ProtocolBugs{ + FailIfCECPQ2Offered: true, + }, + }, + }) + + // If CECPQ2 is offered, both X25519 and CECPQ2 should have a key-share. + testCases = append(testCases, testCase{ + name: "NotJustCECPQ2KeyShare", + config: Config{ + MinVersion: VersionTLS13, + Bugs: ProtocolBugs{ + ExpectedKeyShares: []CurveID{CurveCECPQ2, CurveX25519}, + }, + }, + flags: []string{ + "-curves", strconv.Itoa(int(CurveCECPQ2)), + "-curves", strconv.Itoa(int(CurveX25519)), + "-expect-curve-id", strconv.Itoa(int(CurveCECPQ2)), + }, + }) + + // ... but only if CECPQ2 is listed first. + testCases = append(testCases, testCase{ + name: "CECPQ2KeyShareNotIncludedSecond", + config: Config{ + MinVersion: VersionTLS13, + Bugs: ProtocolBugs{ + ExpectedKeyShares: []CurveID{CurveX25519}, + }, + }, + flags: []string{ + "-curves", strconv.Itoa(int(CurveX25519)), + "-curves", strconv.Itoa(int(CurveCECPQ2)), + "-expect-curve-id", strconv.Itoa(int(CurveX25519)), + }, + }) + + // If CECPQ2 is the only configured curve, the key share is sent. 
+	testCases = append(testCases, testCase{
+		name: "JustConfiguringCECPQ2Works",
+		config: Config{
+			MinVersion: VersionTLS13,
+			Bugs: ProtocolBugs{
+				ExpectedKeyShares: []CurveID{CurveCECPQ2},
+			},
+		},
+		flags: []string{
+			"-curves", strconv.Itoa(int(CurveCECPQ2)),
+			"-expect-curve-id", strconv.Itoa(int(CurveCECPQ2)),
+		},
+	})
+
+	// As a server, CECPQ2 is not yet supported by default.
+	testCases = append(testCases, testCase{
+		testType: serverTest,
+		name: "CECPQ2NotEnabledByDefaultForAServer",
+		config: Config{
+			MinVersion: VersionTLS13,
+			CurvePreferences: []CurveID{CurveCECPQ2, CurveX25519},
+			DefaultCurves: []CurveID{CurveCECPQ2},
+		},
+		flags: []string{
+			"-server-preference",
+			"-expect-curve-id", strconv.Itoa(int(CurveX25519)),
+		},
+	})
 }
 
 func addTLS13RecordTests() {
@@ -12706,7 +12817,7 @@ func addTLS13HandshakeTests() {
 			},
 		},
 		tls13Variant: variant,
-		flags: []string{"-p384-only"},
+		flags: []string{"-curves", strconv.Itoa(int(CurveP384))},
 		shouldFail: true,
 		expectedError: ":WRONG_CURVE:",
 	})
diff --git a/ssl/test/test_config.cc b/ssl/test/test_config.cc
index 7447d5ad..9a5c9b26 100644
--- a/ssl/test/test_config.cc
+++ b/ssl/test/test_config.cc
@@ -104,7 +104,6 @@ const Flag<bool> kBoolFlags[] = {
   { "-renegotiate-ignore", &TestConfig::renegotiate_ignore },
   { "-forbid-renegotiation-after-handshake",
     &TestConfig::forbid_renegotiation_after_handshake },
-  { "-p384-only", &TestConfig::p384_only },
   { "-enable-all-curves", &TestConfig::enable_all_curves },
   { "-use-old-client-cert-callback",
     &TestConfig::use_old_client_cert_callback },
@@ -147,6 +146,7 @@ const Flag<bool> kBoolFlags[] = {
   { "-handshaker-resume", &TestConfig::handshaker_resume },
   { "-reverify-on-resume", &TestConfig::reverify_on_resume },
   { "-jdk11-workaround", &TestConfig::jdk11_workaround },
+  { "-server-preference", &TestConfig::server_preference },
 };
 
 const Flag<std::string> kStringFlags[] = {
@@ -220,10 +220,10 @@
 };
 
 const Flag<std::vector<int>> kIntVectorFlags[] = {
-  { "-signing-prefs", &TestConfig::signing_prefs },
-  { "-verify-prefs", &TestConfig::verify_prefs },
-  { "-expect-peer-verify-pref",
-    &TestConfig::expected_peer_verify_prefs },
+  {"-signing-prefs", &TestConfig::signing_prefs},
+  {"-verify-prefs", &TestConfig::verify_prefs},
+  {"-expect-peer-verify-pref", &TestConfig::expected_peer_verify_prefs},
+  {"-curves", &TestConfig::curves},
 };
 
 bool ParseFlag(char *flag, int argc, char **argv, int *i,
@@ -1294,7 +1294,6 @@ bssl::UniquePtr<SSL_CTX> TestConfig::SetupCtx(SSL_CTX *old_ctx) const {
     return nullptr;
   }
 
-
   if (install_cert_compression_algs &&
       (!SSL_CTX_add_cert_compression_alg(
            ssl_ctx.get(), 0xff02,
@@ -1341,6 +1340,10 @@ bssl::UniquePtr<SSL_CTX> TestConfig::SetupCtx(SSL_CTX *old_ctx) const {
     abort();
   }
 
+  if (server_preference) {
+    SSL_CTX_set_options(ssl_ctx.get(), SSL_OP_CIPHER_SERVER_PREFERENCE);
+  }
+
   return ssl_ctx;
 }
 
@@ -1589,16 +1592,43 @@ bssl::UniquePtr<SSL> TestConfig::NewSSL(
   if (!check_close_notify) {
     SSL_set_quiet_shutdown(ssl.get(), 1);
   }
-  if (p384_only) {
-    int nid = NID_secp384r1;
-    if (!SSL_set1_curves(ssl.get(), &nid, 1)) {
-      return nullptr;
+  if (!curves.empty()) {
+    std::vector<int> nids;
+    for (auto curve : curves) {
+      switch (curve) {
+        case SSL_CURVE_SECP224R1:
+          nids.push_back(NID_secp224r1);
+          break;
+
+        case SSL_CURVE_SECP256R1:
+          nids.push_back(NID_X9_62_prime256v1);
+          break;
+
+        case SSL_CURVE_SECP384R1:
+          nids.push_back(NID_secp384r1);
+          break;
+
+        case SSL_CURVE_SECP521R1:
+          nids.push_back(NID_secp521r1);
+          break;
+
+        case SSL_CURVE_X25519:
+          nids.push_back(NID_X25519);
+          break;
+
+        case SSL_CURVE_CECPQ2:
+          nids.push_back(NID_CECPQ2);
+          break;
+      }
+      if (!SSL_set1_curves(ssl.get(), &nids[0], nids.size())) {
+        return nullptr;
+      }
     }
   }
   if (enable_all_curves) {
     static const int kAllCurves[] = {
-        NID_secp224r1, NID_X9_62_prime256v1, NID_secp384r1,
-        NID_secp521r1, NID_X25519,
+        NID_secp224r1, NID_X9_62_prime256v1, NID_secp384r1,
+        NID_secp521r1, NID_X25519, NID_CECPQ2,
     };
     if (!SSL_set1_curves(ssl.get(), kAllCurves,
                          OPENSSL_ARRAY_SIZE(kAllCurves))) {
diff --git a/ssl/test/test_config.h b/ssl/test/test_config.h
index bffe9118..0e842c0f 100644
--- a/ssl/test/test_config.h
+++ b/ssl/test/test_config.h
@@ -33,6 +33,7 @@ struct TestConfig {
   std::vector<int> signing_prefs;
   std::vector<int> verify_prefs;
   std::vector<int> expected_peer_verify_prefs;
+  std::vector<int> curves;
   std::string key_file;
   std::string cert_file;
   std::string expected_server_name;
@@ -122,7 +123,6 @@ struct TestConfig {
   bool renegotiate_ignore = false;
   bool forbid_renegotiation_after_handshake = false;
   int expect_peer_signature_algorithm = 0;
-  bool p384_only = false;
   bool enable_all_curves = false;
   int expect_curve_id = 0;
   bool use_old_client_cert_callback = false;
@@ -170,6 +170,7 @@ struct TestConfig {
   bool handshaker_resume = false;
   std::string handshaker_path;
   bool jdk11_workaround = false;
+  bool server_preference = false;
   int argc;
   char **argv;
diff --git a/ssl/tls13_client.cc b/ssl/tls13_client.cc
index 0d778962..40913dcf 100644
--- a/ssl/tls13_client.cc
+++ b/ssl/tls13_client.cc
@@ -165,15 +165,17 @@ static enum ssl_hs_wait_t do_read_hello_retry_request(SSL_HANDSHAKE *hs) {
     return ssl_hs_error;
   }
 
-  // Check that the HelloRetryRequest does not request the key share that
-  // was provided in the initial ClientHello.
-  if (hs->key_share->GroupID() == group_id) {
+  // Check that the HelloRetryRequest does not request a key share that was
+  // provided in the initial ClientHello.
+ if (hs->key_shares[0]->GroupID() == group_id || + (hs->key_shares[1] && hs->key_shares[1]->GroupID() == group_id)) { ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_ILLEGAL_PARAMETER); OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE); return ssl_hs_error; } - hs->key_share.reset(); + hs->key_shares[0].reset(); + hs->key_shares[1].reset(); hs->retry_group = group_id; } diff --git a/tool/speed.cc b/tool/speed.cc index 2175baa2..975fb531 100644 --- a/tool/speed.cc +++ b/tool/speed.cc @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -744,6 +745,61 @@ static bool SpeedScrypt(const std::string &selected) { return true; } +static bool SpeedHRSS(const std::string &selected) { + if (!selected.empty() && selected != "HRSS") { + return true; + } + + TimeResults results; + + if (!TimeFunction(&results, []() -> bool { + struct HRSS_public_key pub; + struct HRSS_private_key priv; + uint8_t entropy[HRSS_GENERATE_KEY_BYTES]; + RAND_bytes(entropy, sizeof(entropy)); + HRSS_generate_key(&pub, &priv, entropy); + return true; + })) { + fprintf(stderr, "Failed to time HRSS_generate_key.\n"); + return false; + } + + results.Print("HRSS generate"); + + struct HRSS_public_key pub; + struct HRSS_private_key priv; + uint8_t key_entropy[HRSS_GENERATE_KEY_BYTES]; + RAND_bytes(key_entropy, sizeof(key_entropy)); + HRSS_generate_key(&pub, &priv, key_entropy); + + uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES]; + if (!TimeFunction(&results, [&pub, &ciphertext]() -> bool { + uint8_t entropy[HRSS_ENCAP_BYTES]; + uint8_t shared_key[HRSS_KEY_BYTES]; + RAND_bytes(entropy, sizeof(entropy)); + HRSS_encap(ciphertext, shared_key, &pub, entropy); + return true; + })) { + fprintf(stderr, "Failed to time HRSS_encap.\n"); + return false; + } + + results.Print("HRSS encap"); + + if (!TimeFunction(&results, [&pub, &priv, &ciphertext]() -> bool { + uint8_t shared_key[HRSS_KEY_BYTES]; + HRSS_decap(shared_key, &pub, &priv, ciphertext, sizeof(ciphertext)); + return true; + })) { + fprintf(stderr, "Failed to time HRSS_encap.\n"); + return false; + } + + results.Print("HRSS decap"); + + return true; +} + static const struct argument kArguments[] = { { "-filter", kOptionalArgument, @@ -817,7 +873,8 @@ bool Speed(const std::vector &args) { !Speed25519(selected) || !SpeedSPAKE2(selected) || !SpeedScrypt(selected) || - !SpeedRSAKeyGen(selected)) { + !SpeedRSAKeyGen(selected) || + !SpeedHRSS(selected)) { return false; } diff --git a/util/generate_build_files.py b/util/generate_build_files.py index 9c635dcf..44db7f57 100644 --- a/util/generate_build_files.py +++ b/util/generate_build_files.py @@ -43,6 +43,10 @@ NON_PERL_FILES = { ('linux', 'arm'): [ 'src/crypto/curve25519/asm/x25519-asm-arm.S', 'src/crypto/poly1305/poly1305_arm_asm.S', + 'src/crypto/hrss/asm/poly_mul_vec_armv7_neon.S', + ], + ('linux', 'x86_64'): [ + 'src/crypto/hrss/asm/poly_rq_mul.S', ], }
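
SpeedHRSS times key generation, encapsulation and decapsulation through the C API. The Go reference implementation added under ssl/test/runner/hrss supports the same round trip, sketched below; the import path is the one key_agreement.go uses and is only resolvable inside this repository's Go tree, so treat this as an illustrative sketch rather than part of the patch:

    package main

    import (
        "bytes"
        "crypto/rand"
        "fmt"

        "boringssl.googlesource.com/boringssl/ssl/test/runner/hrss"
    )

    func main() {
        // Generate a key, encapsulate to the public key, then decapsulate;
        // this mirrors the three operations SpeedHRSS benchmarks in C.
        priv := hrss.GenerateKey(rand.Reader)

        ciphertext, sharedKeyA := priv.PublicKey.Encap(rand.Reader)
        sharedKeyB, ok := priv.Decap(ciphertext)

        fmt.Println(ok, bytes.Equal(sharedKeyA, sharedKeyB)) // expected: true true
    }
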