NTRU: inline the one call that needed @plt

Этот коммит содержится в:
John M. Schanck 2020-09-02 12:54:00 -04:00 коммит произвёл Kris Kwiatkowski
родитель a008d4ad21
Коммит 4c268aae14
6 изменённых файлов: 278 добавлений и 11 удалений

Просмотреть файл

@ -23,9 +23,9 @@ auxiliary-submitters:
- Zhenfei Zhang - Zhenfei Zhang
implementations: implementations:
- name: clean - name: clean
version: https://github.com/jschanck/ntru/tree/b43afe59 reference implementation version: https://github.com/jschanck/ntru/tree/ff3c84e1 reference implementation
- name: avx2 - name: avx2
version: https://github.com/jschanck/ntru/tree/b43afe59 avx2 implementation version: https://github.com/jschanck/ntru/tree/ff3c84e1 avx2 implementation
supported_platforms: supported_platforms:
- architecture: x86_64 - architecture: x86_64
operating_systems: operating_systems:

Просмотреть файл

@ -23,9 +23,9 @@ auxiliary-submitters:
- Zhenfei Zhang - Zhenfei Zhang
implementations: implementations:
- name: clean - name: clean
version: https://github.com/jschanck/ntru/tree/b43afe59 reference implementation version: https://github.com/jschanck/ntru/tree/ff3c84e1 reference implementation
- name: avx2 - name: avx2
version: https://github.com/jschanck/ntru/tree/b43afe59 avx2 implementation version: https://github.com/jschanck/ntru/tree/ff3c84e1 avx2 implementation
supported_platforms: supported_platforms:
- architecture: x86_64 - architecture: x86_64
operating_systems: operating_systems:

Просмотреть файл

@ -23,9 +23,9 @@ auxiliary-submitters:
- Zhenfei Zhang - Zhenfei Zhang
implementations: implementations:
- name: clean - name: clean
version: https://github.com/jschanck/ntru/tree/b43afe59 reference implementation version: https://github.com/jschanck/ntru/tree/ff3c84e1 reference implementation
- name: avx2 - name: avx2
version: https://github.com/jschanck/ntru/tree/b43afe59 avx2 implementation version: https://github.com/jschanck/ntru/tree/ff3c84e1 avx2 implementation
supported_platforms: supported_platforms:
- architecture: x86_64 - architecture: x86_64
operating_systems: operating_systems:

Просмотреть файл

@ -23,9 +23,9 @@ auxiliary-submitters:
- Zhenfei Zhang - Zhenfei Zhang
implementations: implementations:
- name: clean - name: clean
version: https://github.com/jschanck/ntru/tree/b43afe59 reference implementation version: https://github.com/jschanck/ntru/tree/ff3c84e1 reference implementation
- name: avx2 - name: avx2
version: https://github.com/jschanck/ntru/tree/b43afe59 avx2 implementation version: https://github.com/jschanck/ntru/tree/ff3c84e1 avx2 implementation
supported_platforms: supported_platforms:
- architecture: x86_64 - architecture: x86_64
operating_systems: operating_systems:

Просмотреть файл

@ -4,7 +4,7 @@ LIB=libntruhrss701_avx2.a
HEADERS=api.h cmov.h owcpa.h params.h poly.h poly_r2_inv.h sample.h HEADERS=api.h cmov.h owcpa.h params.h poly.h poly_r2_inv.h sample.h
OBJECTS=cmov.o kem.o owcpa.o pack3.o packq.o poly.o poly_r2_inv.o sample.o sample_iid.o \ OBJECTS=cmov.o kem.o owcpa.o pack3.o packq.o poly.o poly_r2_inv.o sample.o sample_iid.o \
square_1_701_patience.o square_3_701_patience.o square_6_701_patience.o square_12_701_shufbytes.o square_15_701_shufbytes.o square_27_701_shufbytes.o square_42_701_shufbytes.o square_84_701_shufbytes.o square_168_701_shufbytes.o square_336_701_shufbytes.o \ square_1_701_patience.o square_3_701_patience.o square_6_701_patience.o square_12_701_shufbytes.o square_15_701_shufbytes.o square_27_701_shufbytes.o square_42_701_shufbytes.o square_84_701_shufbytes.o square_168_701_shufbytes.o square_336_701_shufbytes.o \
poly_mod_3_Phi_n.o poly_mod_q_Phi_n.o poly_r2_mul.o poly_rq_mul.o poly_rq_mul_x_minus_1.o poly_rq_to_s3.o poly_s3_inv.o poly_s3_to_rq.o vec32_sample_iid.o poly_lift.o poly_mod_3_Phi_n.o poly_mod_q_Phi_n.o poly_r2_mul.o poly_rq_mul.o poly_rq_to_s3.o poly_s3_inv.o vec32_sample_iid.o
CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) CFLAGS=-O3 -mavx2 -mbmi2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)

Просмотреть файл

@ -288,6 +288,90 @@ mask_n:
.word 0 .word 0
.word 0 .word 0
.word 0 .word 0
/* mask_mod8192: sixteen 16-bit words, each equal to 8191 (0x1FFF).
 * Used as a vpand operand so that every word lane keeps only its low
 * 13 bits, i.e. each lane is reduced modulo 8192. */
mask_mod8192:
.rept 16
.word 8191
.endr
/* mask_mod8192_omit_lowest: sixteen 16-bit words; word 0 is 0 and
 * words 1..15 are 8191 (0x1FFF).
 * As a vpand operand it clears the lowest word lane and keeps the low
 * 13 bits of each of the remaining fifteen lanes. */
mask_mod8192_omit_lowest:
.word 0
.rept 15
.word 8191
.endr
/* mask_mod8192_only_lowest: sixteen 16-bit words; word 0 is 8191
 * (0x1FFF) and words 1..15 are 0.
 * As a vpand operand it keeps the low 13 bits of the lowest word lane
 * and clears the other fifteen lanes. */
mask_mod8192_only_lowest:
.word 8191
.rept 15
.word 0
.endr
/* shuf_5_to_0_zerorest: 32-byte control vector for vpshufb.
 * Control bytes 0 and 1 hold the indices 10 and 11, so destination
 * word 0 receives word 5 (bytes 10-11) of the source's low 128-bit lane.
 * The remaining 30 control bytes are 255; their high bit is set, which
 * makes vpshufb write a zero byte in each of those positions. */
shuf_5_to_0_zerorest:
.byte 10
.byte 11
.rept 30
.byte 255
.endr
.text .text
.global PQCLEAN_NTRUHRSS701_AVX2_poly_lift .global PQCLEAN_NTRUHRSS701_AVX2_poly_lift
.global _PQCLEAN_NTRUHRSS701_AVX2_poly_lift .global _PQCLEAN_NTRUHRSS701_AVX2_poly_lift
@ -3112,7 +3196,190 @@ vpand const_modq(%rip), %ymm2, %ymm2
vpand const_1s(%rip), %ymm3, %ymm3 vpand const_1s(%rip), %ymm3, %ymm3
vpor %ymm3, %ymm2, %ymm3 vpor %ymm3, %ymm2, %ymm3
vmovdqa %ymm3, 1376(%rsp) vmovdqa %ymm3, 1376(%rsp)
mov %rsp, %rsi vmovdqu 1374(%rsp), %ymm0
call PQCLEAN_NTRUHRSS701_AVX2_poly_Rq_mul_x_minus_1@plt vpsubw 1376(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1376(%rdi)
vextracti128 $1, %ymm0, %xmm4
vpshufb shuf_5_to_0_zerorest(%rip), %ymm4, %ymm4
vpsubw 0(%rsp), %ymm4, %ymm4
vpand mask_mod8192_only_lowest(%rip), %ymm4, %ymm4
vmovdqu 1342(%rsp), %ymm0
vpsubw 1344(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1344(%rdi)
vmovdqu 1310(%rsp), %ymm0
vpsubw 1312(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1312(%rdi)
vmovdqu 1278(%rsp), %ymm0
vpsubw 1280(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1280(%rdi)
vmovdqu 1246(%rsp), %ymm0
vpsubw 1248(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1248(%rdi)
vmovdqu 1214(%rsp), %ymm0
vpsubw 1216(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1216(%rdi)
vmovdqu 1182(%rsp), %ymm0
vpsubw 1184(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1184(%rdi)
vmovdqu 1150(%rsp), %ymm0
vpsubw 1152(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1152(%rdi)
vmovdqu 1118(%rsp), %ymm0
vpsubw 1120(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1120(%rdi)
vmovdqu 1086(%rsp), %ymm0
vpsubw 1088(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1088(%rdi)
vmovdqu 1054(%rsp), %ymm0
vpsubw 1056(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1056(%rdi)
vmovdqu 1022(%rsp), %ymm0
vpsubw 1024(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 1024(%rdi)
vmovdqu 990(%rsp), %ymm0
vpsubw 992(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 992(%rdi)
vmovdqu 958(%rsp), %ymm0
vpsubw 960(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 960(%rdi)
vmovdqu 926(%rsp), %ymm0
vpsubw 928(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 928(%rdi)
vmovdqu 894(%rsp), %ymm0
vpsubw 896(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 896(%rdi)
vmovdqu 862(%rsp), %ymm0
vpsubw 864(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 864(%rdi)
vmovdqu 830(%rsp), %ymm0
vpsubw 832(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 832(%rdi)
vmovdqu 798(%rsp), %ymm0
vpsubw 800(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 800(%rdi)
vmovdqu 766(%rsp), %ymm0
vpsubw 768(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 768(%rdi)
vmovdqu 734(%rsp), %ymm0
vpsubw 736(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 736(%rdi)
vmovdqu 702(%rsp), %ymm0
vpsubw 704(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 704(%rdi)
vmovdqu 670(%rsp), %ymm0
vpsubw 672(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 672(%rdi)
vmovdqu 638(%rsp), %ymm0
vpsubw 640(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 640(%rdi)
vmovdqu 606(%rsp), %ymm0
vpsubw 608(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 608(%rdi)
vmovdqu 574(%rsp), %ymm0
vpsubw 576(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 576(%rdi)
vmovdqu 542(%rsp), %ymm0
vpsubw 544(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 544(%rdi)
vmovdqu 510(%rsp), %ymm0
vpsubw 512(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 512(%rdi)
vmovdqu 478(%rsp), %ymm0
vpsubw 480(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 480(%rdi)
vmovdqu 446(%rsp), %ymm0
vpsubw 448(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 448(%rdi)
vmovdqu 414(%rsp), %ymm0
vpsubw 416(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 416(%rdi)
vmovdqu 382(%rsp), %ymm0
vpsubw 384(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 384(%rdi)
vmovdqu 350(%rsp), %ymm0
vpsubw 352(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 352(%rdi)
vmovdqu 318(%rsp), %ymm0
vpsubw 320(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 320(%rdi)
vmovdqu 286(%rsp), %ymm0
vpsubw 288(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 288(%rdi)
vmovdqu 254(%rsp), %ymm0
vpsubw 256(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 256(%rdi)
vmovdqu 222(%rsp), %ymm0
vpsubw 224(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 224(%rdi)
vmovdqu 190(%rsp), %ymm0
vpsubw 192(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 192(%rdi)
vmovdqu 158(%rsp), %ymm0
vpsubw 160(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 160(%rdi)
vmovdqu 126(%rsp), %ymm0
vpsubw 128(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 128(%rdi)
vmovdqu 94(%rsp), %ymm0
vpsubw 96(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 96(%rdi)
vmovdqu 62(%rsp), %ymm0
vpsubw 64(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 64(%rdi)
vmovdqu 30(%rsp), %ymm0
vpsubw 32(%rsp), %ymm0, %ymm1
vpand mask_mod8192(%rip), %ymm1, %ymm1
vmovdqa %ymm1, 32(%rdi)
vmovdqa 0(%rsp), %ymm3
vpsrlq $48, %ymm3, %ymm0
vpermq $147, %ymm0, %ymm0
vpsllq $16, %ymm3, %ymm2
vpxor %ymm0, %ymm2, %ymm2
vpsubw %ymm3, %ymm2, %ymm3
vpand mask_mod8192_omit_lowest(%rip), %ymm3, %ymm3
vpxor %ymm3, %ymm4, %ymm3
vmovdqa %ymm3, 0(%rdi)
mov %r8, %rsp mov %r8, %rsp
ret ret