From ef4895b55614f6926c384c8986d41d1e7eb6b2d8 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Tue, 6 Apr 2021 23:41:01 +0100 Subject: [PATCH 01/12] pull SIKE --- src/kem/sike/sike-p434-sha256/asm/fp-x86_64.S | 1095 +++++++++++++++++ .../sike/sike-p434-sha256/asm/fp_generic.c | 179 +++ src/kem/sike/sike-p434-sha256/fpx.c | 282 +++++ src/kem/sike/sike-p434-sha256/fpx.h | 112 ++ src/kem/sike/sike-p434-sha256/isogeny.c | 262 ++++ src/kem/sike/sike-p434-sha256/isogeny.h | 49 + src/kem/sike/sike-p434-sha256/params.c | 128 ++ src/kem/sike/sike-p434-sha256/sike.c | 517 ++++++++ src/kem/sike/sike-p434-sha256/utils.h | 231 ++++ 9 files changed, 2855 insertions(+) create mode 100644 src/kem/sike/sike-p434-sha256/asm/fp-x86_64.S create mode 100644 src/kem/sike/sike-p434-sha256/asm/fp_generic.c create mode 100644 src/kem/sike/sike-p434-sha256/fpx.c create mode 100644 src/kem/sike/sike-p434-sha256/fpx.h create mode 100644 src/kem/sike/sike-p434-sha256/isogeny.c create mode 100644 src/kem/sike/sike-p434-sha256/isogeny.h create mode 100644 src/kem/sike/sike-p434-sha256/params.c create mode 100644 src/kem/sike/sike-p434-sha256/sike.c create mode 100644 src/kem/sike/sike-p434-sha256/utils.h diff --git a/src/kem/sike/sike-p434-sha256/asm/fp-x86_64.S b/src/kem/sike/sike-p434-sha256/asm/fp-x86_64.S new file mode 100644 index 00000000..4e2d7b74 --- /dev/null +++ b/src/kem/sike/sike-p434-sha256/asm/fp-x86_64.S @@ -0,0 +1,1095 @@ +# This file is generated from a similarly-named Perl script in the BoringSSL +# source tree. Do not edit by hand. 
+ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include +#endif +.text + + +.Lp434x2: +.quad 0xFFFFFFFFFFFFFFFE +.quad 0xFFFFFFFFFFFFFFFF +.quad 0xFB82ECF5C5FFFFFF +.quad 0xF78CB8F062B15D47 +.quad 0xD9F8BFAD038A40AC +.quad 0x0004683E4E2EE688 + + +.Lp434p1: +.quad 0xFDC1767AE3000000 +.quad 0x7BC65C783158AEA3 +.quad 0x6CFC5FD681C52056 +.quad 0x0002341F27177344 + +.globl sike_fpadd +.hidden sike_fpadd +.type sike_fpadd,@function +sike_fpadd: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + + xorq %rax,%rax + + movq 0(%rdi),%r8 + addq 0(%rsi),%r8 + movq 8(%rdi),%r9 + adcq 8(%rsi),%r9 + movq 16(%rdi),%r10 + adcq 16(%rsi),%r10 + movq 24(%rdi),%r11 + adcq 24(%rsi),%r11 + movq 32(%rdi),%r12 + adcq 32(%rsi),%r12 + movq 40(%rdi),%r13 + adcq 40(%rsi),%r13 + movq 48(%rdi),%r14 + adcq 48(%rsi),%r14 + + movq .Lp434x2(%rip),%rcx + subq %rcx,%r8 + movq 8+.Lp434x2(%rip),%rcx + sbbq %rcx,%r9 + sbbq %rcx,%r10 + movq 16+.Lp434x2(%rip),%rcx + sbbq %rcx,%r11 + movq 24+.Lp434x2(%rip),%rcx + sbbq %rcx,%r12 + movq 32+.Lp434x2(%rip),%rcx + sbbq %rcx,%r13 + movq 40+.Lp434x2(%rip),%rcx + sbbq %rcx,%r14 + + sbbq $0,%rax + + movq .Lp434x2(%rip),%rdi + andq %rax,%rdi + movq 8+.Lp434x2(%rip),%rsi + andq %rax,%rsi + movq 16+.Lp434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc %cl + movq 24+.Lp434x2(%rip),%r8 + andq %rax,%r8 + movq 32+.Lp434x2(%rip),%r9 + andq %rax,%r9 + movq 40+.Lp434x2(%rip),%r10 + andq %rax,%r10 + btq $0,%rcx + + adcq %r8,%r12 + movq %r12,32(%rdx) + adcq %r9,%r13 + movq %r13,40(%rdx) + adcq %r10,%r14 + movq 
%r14,48(%rdx) + + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_cswap_asm +.hidden sike_cswap_asm +.type sike_cswap_asm,@function +sike_cswap_asm: + + + movq %rdx,%xmm3 + + + + + + pshufd $68,%xmm3,%xmm3 + + movdqu 0(%rdi),%xmm0 + movdqu 0(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,0(%rdi) + movdqu %xmm1,0(%rsi) + + movdqu 16(%rdi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,16(%rdi) + movdqu %xmm1,16(%rsi) + + movdqu 32(%rdi),%xmm0 + movdqu 32(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,32(%rdi) + movdqu %xmm1,32(%rsi) + + movdqu 48(%rdi),%xmm0 + movdqu 48(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,48(%rdi) + movdqu %xmm1,48(%rsi) + + movdqu 64(%rdi),%xmm0 + movdqu 64(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,64(%rdi) + movdqu %xmm1,64(%rsi) + + movdqu 80(%rdi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,80(%rdi) + movdqu %xmm1,80(%rsi) + + movdqu 96(%rdi),%xmm0 + movdqu 96(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,96(%rdi) + movdqu %xmm1,96(%rsi) + + movdqu 112(%rdi),%xmm0 + movdqu 112(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,112(%rdi) + movdqu %xmm1,112(%rsi) + + movdqu 128(%rdi),%xmm0 + movdqu 128(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor 
%xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,128(%rdi) + movdqu %xmm1,128(%rsi) + + movdqu 144(%rdi),%xmm0 + movdqu 144(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,144(%rdi) + movdqu %xmm1,144(%rsi) + + movdqu 160(%rdi),%xmm0 + movdqu 160(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,160(%rdi) + movdqu %xmm1,160(%rsi) + + movdqu 176(%rdi),%xmm0 + movdqu 176(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,176(%rdi) + movdqu %xmm1,176(%rsi) + + movdqu 192(%rdi),%xmm0 + movdqu 192(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,192(%rdi) + movdqu %xmm1,192(%rsi) + + movdqu 208(%rdi),%xmm0 + movdqu 208(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,208(%rdi) + movdqu %xmm1,208(%rsi) + + .byte 0xf3,0xc3 +.globl sike_fpsub +.hidden sike_fpsub +.type sike_fpsub,@function +sike_fpsub: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + + xorq %rax,%rax + + movq 0(%rdi),%r8 + subq 0(%rsi),%r8 + movq 8(%rdi),%r9 + sbbq 8(%rsi),%r9 + movq 16(%rdi),%r10 + sbbq 16(%rsi),%r10 + movq 24(%rdi),%r11 + sbbq 24(%rsi),%r11 + movq 32(%rdi),%r12 + sbbq 32(%rsi),%r12 + movq 40(%rdi),%r13 + sbbq 40(%rsi),%r13 + movq 48(%rdi),%r14 + sbbq 48(%rsi),%r14 + + sbbq $0x0,%rax + + movq .Lp434x2(%rip),%rdi + andq %rax,%rdi + movq 8+.Lp434x2(%rip),%rsi + andq %rax,%rsi + movq 16+.Lp434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc 
%cl + movq 24+.Lp434x2(%rip),%r8 + andq %rax,%r8 + movq 32+.Lp434x2(%rip),%r9 + andq %rax,%r9 + movq 40+.Lp434x2(%rip),%r10 + andq %rax,%r10 + btq $0x0,%rcx + + adcq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpadd_asm +.hidden sike_mpadd_asm +.type sike_mpadd_asm,@function +sike_mpadd_asm: +.cfi_startproc + movq 0(%rdi),%r8; + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + addq 0(%rsi),%r8 + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + adcq 24(%rsi),%r11 + adcq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + adcq 40(%rsi),%r8 + adcq 48(%rsi),%r9 + movq %r8,40(%rdx) + movq %r9,48(%rdx) + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpsubx2_asm +.hidden sike_mpsubx2_asm +.type sike_mpsubx2_asm,@function +sike_mpsubx2_asm: +.cfi_startproc + xorq %rax,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + movq 56(%rdi),%r10 + movq 64(%rdi),%r11 + movq 72(%rdi),%rcx + sbbq 40(%rsi),%r8 + sbbq 48(%rsi),%r9 + sbbq 56(%rsi),%r10 + sbbq 64(%rsi),%r11 + sbbq 72(%rsi),%rcx + movq %r8,40(%rdx) + movq %r9,48(%rdx) + movq %r10,56(%rdx) + movq %r11,64(%rdx) + movq %rcx,72(%rdx) + + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + sbbq 80(%rsi),%r8 + sbbq 88(%rsi),%r9 + sbbq 96(%rsi),%r10 + sbbq 104(%rsi),%r11 + sbbq $0x0,%rax + movq %r8,80(%rdx) + movq %r9,88(%rdx) + movq %r10,96(%rdx) + movq 
%r11,104(%rdx) + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpdblsubx2_asm +.hidden sike_mpdblsubx2_asm +.type sike_mpdblsubx2_asm,@function +sike_mpdblsubx2_asm: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + + xorq %rax,%rax + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + movq 48(%rdx),%rcx + subq 0(%rdi),%r8 + sbbq 8(%rdi),%r9 + sbbq 16(%rdi),%r10 + sbbq 24(%rdi),%r11 + sbbq 32(%rdi),%r12 + sbbq 40(%rdi),%r13 + sbbq 48(%rdi),%rcx + adcq $0x0,%rax + + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%r12 + sbbq 40(%rsi),%r13 + sbbq 48(%rsi),%rcx + adcq $0x0,%rax + + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %rcx,48(%rdx) + + + movq 56(%rdx),%r8 + movq 64(%rdx),%r9 + movq 72(%rdx),%r10 + movq 80(%rdx),%r11 + movq 88(%rdx),%r12 + movq 96(%rdx),%r13 + movq 104(%rdx),%rcx + + subq %rax,%r8 + sbbq 56(%rdi),%r8 + sbbq 64(%rdi),%r9 + sbbq 72(%rdi),%r10 + sbbq 80(%rdi),%r11 + sbbq 88(%rdi),%r12 + sbbq 96(%rdi),%r13 + sbbq 104(%rdi),%rcx + + + subq 56(%rsi),%r8 + sbbq 64(%rsi),%r9 + sbbq 72(%rsi),%r10 + sbbq 80(%rsi),%r11 + sbbq 88(%rsi),%r12 + sbbq 96(%rsi),%r13 + sbbq 104(%rsi),%rcx + + + movq %r8,56(%rdx) + movq %r9,64(%rdx) + movq %r10,72(%rdx) + movq %r11,80(%rdx) + movq %r12,88(%rdx) + movq %r13,96(%rdx) + movq %rcx,104(%rdx) + + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc + +.globl sike_fprdc +.hidden sike_fprdc +.type sike_fprdc,@function +sike_fprdc: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
r15, -40 + + xorq %rax,%rax + movq 0+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 0+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 24(%rdi),%r8 + adcq 32(%rdi),%r9 + adcq 40(%rdi),%r10 + adcq 48(%rdi),%r11 + adcq 56(%rdi),%r12 + adcq 64(%rdi),%r13 + adcq 72(%rdi),%rcx + movq %r8,24(%rdi) + movq %r9,32(%rdi) + movq %r10,40(%rdi) + movq %r11,48(%rdi) + movq %r12,56(%rdi) + movq %r13,64(%rdi) + movq %rcx,72(%rdi) + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + adcq $0x0,%r8 + adcq $0x0,%r9 + adcq $0x0,%r10 + adcq $0x0,%r11 + movq %r8,80(%rdi) + movq %r9,88(%rdi) + movq %r10,96(%rdi) + movq %r11,104(%rdi) + + xorq %rax,%rax + movq 16+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 16+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 40(%rdi),%r8 + adcq 48(%rdi),%r9 + adcq 56(%rdi),%r10 + adcq 64(%rdi),%r11 + adcq 72(%rdi),%r12 + adcq 80(%rdi),%r13 + adcq 88(%rdi),%rcx + movq %r8,40(%rdi) + movq %r9,48(%rdi) + movq %r10,56(%rdi) + movq 
%r11,64(%rdi) + movq %r12,72(%rdi) + movq %r13,80(%rdi) + movq %rcx,88(%rdi) + movq 96(%rdi),%r8 + movq 104(%rdi),%r9 + adcq $0x0,%r8 + adcq $0x0,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + + xorq %rax,%rax + movq 32+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 32+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 56(%rdi),%r8 + adcq 64(%rdi),%r9 + adcq 72(%rdi),%r10 + adcq 80(%rdi),%r11 + adcq 88(%rdi),%r12 + adcq 96(%rdi),%r13 + adcq 104(%rdi),%rcx + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + movq %r13,96(%rdi) + movq %rcx,104(%rdi) + + xorq %rax,%rax + movq 48(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + addq 72(%rdi),%r8 + adcq 80(%rdi),%r9 + adcq 88(%rdi),%r10 + adcq 96(%rdi),%r11 + adcq 104(%rdi),%r12 + movq %r8,16(%rsi) + movq %r9,24(%rsi) + movq %r10,32(%rsi) + movq %r11,40(%rsi) + movq %r12,48(%rsi) + + + popq %r15 +.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpmul +.hidden sike_mpmul +.type sike_mpmul,@function +sike_mpmul: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 
+.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15, -40 + + + movq %rdx,%rcx + xorq %rax,%rax + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx, -48 + pushq %rbp +.cfi_offset rbp, -56 +.cfi_adjust_cfa_offset 8 + subq $96,%rsp +.cfi_adjust_cfa_offset 96 + + addq 32(%rdi),%r8 + adcq 40(%rdi),%r9 + adcq 48(%rdi),%r10 + adcq $0x0,%r11 + sbbq $0x0,%rax + movq %r8,0(%rsp) + movq %r9,8(%rsp) + movq %r10,16(%rsp) + movq %r11,24(%rsp) + + + xorq %rbx,%rbx + movq 0(%rsi),%r12 + movq 8(%rsi),%r13 + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + addq 32(%rsi),%r12 + adcq 40(%rsi),%r13 + adcq 48(%rsi),%r14 + adcq $0x0,%r15 + sbbq $0x0,%rbx + movq %r12,32(%rsp) + movq %r13,40(%rsp) + movq %r14,48(%rsp) + movq %r15,56(%rsp) + + + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + + andq %rbx,%r8 + andq %rbx,%r9 + andq %rbx,%r10 + andq %rbx,%r11 + + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r8,64(%rsp) + movq %r9,72(%rsp) + movq %r10,80(%rsp) + movq %r11,88(%rsp) + + + movq 0+0(%rsp),%rdx + mulxq 32+0(%rsp),%r9,%r8 + movq %r9,0+0(%rsp) + mulxq 32+8(%rsp),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 32+16(%rsp),%r11,%r10 + adoxq %r11,%r9 + mulxq 32+24(%rsp),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rsp),%rdx + mulxq 32+0(%rsp),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rsp) + adcxq %r15,%r13 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 32+8(%rsp),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rsp) + adcxq %r11,%r9 + mulxq 32+16(%rsp),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 32+24(%rsp),%rbp,%r12 + adcxq 
%rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r13 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rsp) + movq %r10,0+32(%rsp) + movq %r11,0+40(%rsp) + movq %r12,0+48(%rsp) + movq %rbx,0+56(%rsp) + + + + movq 0+0(%rdi),%rdx + mulxq 0+0(%rsi),%r9,%r8 + movq %r9,0+0(%rcx) + mulxq 0+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 0+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + mulxq 0+24(%rsi),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rdi),%rdx + mulxq 0+0(%rsi),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rcx) + adcxq %r15,%r13 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 0+24(%rsi),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 0+8(%rsi),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rcx) + adcxq %r11,%r9 + mulxq 0+16(%rsi),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 0+24(%rsi),%rbp,%r12 + adcxq %rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r13 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 0+24(%rsi),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rcx) + movq %r10,0+32(%rcx) + movq %r11,0+40(%rcx) + movq %r12,0+48(%rcx) + movq %rbx,0+56(%rcx) + + + + movq 32+0(%rdi),%rdx + mulxq 32+0(%rsi),%r9,%r8 + movq %r9,64+0(%rcx) + mulxq 32+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq 
%r10,%r8 + mulxq 32+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + + movq 32+8(%rdi),%rdx + mulxq 32+0(%rsi),%r12,%r11 + adoxq %rax,%r10 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r14,%r13 + adoxq %r8,%r12 + movq %r12,64+8(%rcx) + adcxq %r14,%r11 + + mulxq 32+16(%rsi),%r8,%r14 + adoxq %r9,%r11 + adcxq %r8,%r13 + adcxq %rax,%r14 + adoxq %r10,%r13 + + movq 32+16(%rdi),%rdx + mulxq 32+0(%rsi),%r8,%r9 + adoxq %rax,%r14 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r10,%r12 + adoxq %r11,%r8 + movq %r8,64+16(%rcx) + adcxq %r13,%r9 + + mulxq 32+16(%rsi),%r11,%r8 + adcxq %r14,%r12 + adcxq %rax,%r8 + adoxq %r10,%r9 + adoxq %r12,%r11 + adoxq %rax,%r8 + movq %r9,64+24(%rcx) + movq %r11,64+32(%rcx) + movq %r8,64+40(%rcx) + + + + + movq 64(%rsp),%r8 + movq 72(%rsp),%r9 + movq 80(%rsp),%r10 + movq 88(%rsp),%r11 + + movq 32(%rsp),%rax + addq %rax,%r8 + movq 40(%rsp),%rax + adcq %rax,%r9 + movq 48(%rsp),%rax + adcq %rax,%r10 + movq 56(%rsp),%rax + adcq %rax,%r11 + + + movq 0(%rsp),%r12 + movq 8(%rsp),%r13 + movq 16(%rsp),%r14 + movq 24(%rsp),%r15 + subq 0(%rcx),%r12 + sbbq 8(%rcx),%r13 + sbbq 16(%rcx),%r14 + sbbq 24(%rcx),%r15 + sbbq 32(%rcx),%r8 + sbbq 40(%rcx),%r9 + sbbq 48(%rcx),%r10 + sbbq 56(%rcx),%r11 + + + subq 64(%rcx),%r12 + sbbq 72(%rcx),%r13 + sbbq 80(%rcx),%r14 + sbbq 88(%rcx),%r15 + sbbq 96(%rcx),%r8 + sbbq 104(%rcx),%r9 + sbbq $0x0,%r10 + sbbq $0x0,%r11 + + addq 32(%rcx),%r12 + movq %r12,32(%rcx) + adcq 40(%rcx),%r13 + movq %r13,40(%rcx) + adcq 48(%rcx),%r14 + movq %r14,48(%rcx) + adcq 56(%rcx),%r15 + movq %r15,56(%rcx) + adcq 64(%rcx),%r8 + movq %r8,64(%rcx) + adcq 72(%rcx),%r9 + movq %r9,72(%rcx) + adcq 80(%rcx),%r10 + movq %r10,80(%rcx) + adcq 88(%rcx),%r11 + movq %r11,88(%rcx) + movq 96(%rcx),%r12 + adcq $0x0,%r12 + movq %r12,96(%rcx) + movq 104(%rcx),%r13 + adcq $0x0,%r13 + movq %r13,104(%rcx) + + addq $96,%rsp +.cfi_adjust_cfa_offset -96 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_same_value rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_same_value rbx + + + popq %r15 
+.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +#endif diff --git a/src/kem/sike/sike-p434-sha256/asm/fp_generic.c b/src/kem/sike/sike-p434-sha256/asm/fp_generic.c new file mode 100644 index 00000000..38e7645e --- /dev/null +++ b/src/kem/sike/sike-p434-sha256/asm/fp_generic.c @@ -0,0 +1,179 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: portable modular arithmetic for P434 +*********************************************************************************************/ + +#if defined(ARCH_GENERIC) || \ + (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) + +#include "../utils.h" +#include "../fpx.h" + +// Global constants +extern const struct params_t params; + +static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c) +{ // Digit multiplication, digit * digit -> 2-digit result + crypto_word_t al, ah, bl, bh, temp; + crypto_word_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; + crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4); + crypto_word_t mask_high = (crypto_word_t)(-1) << (sizeof(crypto_word_t)*4); + + al = a & mask_low; // Low part + ah = a >> (sizeof(crypto_word_t) * 4); // High part + bl = b & mask_low; + bh = b >> (sizeof(crypto_word_t) * 4); + + albl = al*bl; + albh = al*bh; + ahbl = ah*bl; + ahbh = ah*bh; + c[0] = albl & mask_low; // C00 + + res1 = albl >> (sizeof(crypto_word_t) * 4); + res2 = ahbl & mask_low; + res3 = albh & mask_low; + temp = res1 + res2 + res3; + carry = temp >> (sizeof(crypto_word_t) * 4); + c[0] ^= temp << (sizeof(crypto_word_t) * 4); // C01 + + res1 = ahbl >> (sizeof(crypto_word_t) * 4); + res2 = albh >> (sizeof(crypto_word_t) * 4); + res3 = ahbh & mask_low; + temp = res1 + res2 + res3 + carry; + c[1] = temp & mask_low;
// C10 + carry = temp & mask_high; + c[1] ^= (ahbh & mask_high) + carry; // C11 +} + +void sike_fpadd(const felm_t a, const felm_t b, felm_t c) +{ // Modular addition, c = a+b mod p434. + // Inputs: a, b in [0, 2*p434-1] + // Output: c in [0, 2*p434-1] + unsigned int i, carry = 0; + crypto_word_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, a[i], b[i], carry, c[i]); + } + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(carry, c[i], params.prime_x2[i], carry, c[i]); + } + mask = 0 - (crypto_word_t)carry; + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, c[i], params.prime_x2[i] & mask, carry, c[i]); + } +} + +void sike_fpsub(const felm_t a, const felm_t b, felm_t c) +{ // Modular subtraction, c = a-b mod p434. + // Inputs: a, b in [0, 2*p434-1] + // Output: c in [0, 2*p434-1] + unsigned int i, borrow = 0; + crypto_word_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, a[i], b[i], borrow, c[i]); + } + mask = 0 - (crypto_word_t)borrow; + + borrow = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(borrow, c[i], params.prime_x2[i] & mask, borrow, c[i]); + } +} + +void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) +{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. 
+ unsigned int i, j; + crypto_word_t t = 0, u = 0, v = 0, UV[2]; + unsigned int carry = 0; + + for (i = 0; i < NWORDS_FIELD; i++) { + for (j = 0; j <= i; j++) { + MUL(a[j], b[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = NWORDS_FIELD; i < 2*NWORDS_FIELD-1; i++) { + for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) { + MUL(a[j], b[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + c[2*NWORDS_FIELD-1] = v; +} + +void sike_fprdc(const felm_t ma, felm_t mc) +{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. + // mc = ma*R^-1 mod p434x2, where R = 2^448. + // If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. + // ma is assumed to be in Montgomery representation. + unsigned int i, j, carry, count = ZERO_WORDS; + crypto_word_t UV[2], t = 0, u = 0, v = 0; + + for (i = 0; i < NWORDS_FIELD; i++) { + mc[i] = 0; + } + + for (i = 0; i < NWORDS_FIELD; i++) { + for (j = 0; j < i; j++) { + if (j < (i-ZERO_WORDS+1)) { + MUL(mc[j], params.prime_p1[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + ADDC(0, v, ma[i], carry, v); + ADDC(carry, u, 0, carry, u); + t += carry; + mc[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = NWORDS_FIELD; i < 2*NWORDS_FIELD-1; i++) { + if (count > 0) { + count -= 1; + } + for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) { + if (j < (NWORDS_FIELD-count)) { + MUL(mc[j], params.prime_p1[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + ADDC(0, v, ma[i], carry, v); + ADDC(carry, u, 0, carry, u); + t += carry; + mc[i-NWORDS_FIELD] = v; + v = u; + u = t; + t = 0; + } + ADDC(0, v, ma[2*NWORDS_FIELD-1], carry, v); + mc[NWORDS_FIELD-1] = v; +} + +#endif // NO_ASM || 
(!X86_64 && !AARCH64) diff --git a/src/kem/sike/sike-p434-sha256/fpx.c b/src/kem/sike/sike-p434-sha256/fpx.c new file mode 100644 index 00000000..30233406 --- /dev/null +++ b/src/kem/sike/sike-p434-sha256/fpx.c @@ -0,0 +1,282 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: core functions over GF(p) and GF(p^2) +*********************************************************************************************/ +#include +#include "utils.h" +#include "fpx.h" + +extern const struct params_t params; + +// Multiprecision squaring, c = a^2 mod p. +static void fpsqr_mont(const felm_t ma, felm_t mc) +{ + dfelm_t temp = {0}; + sike_mpmul(ma, ma, temp); + sike_fprdc(temp, mc); +} + +// Chain to compute a^(p-3)/4 using Montgomery arithmetic. +static void fpinv_chain_mont(felm_t a) +{ + unsigned int i, j; + felm_t t[31], tt; + + // Precomputed table + fpsqr_mont(a, tt); + sike_fpmul_mont(a, tt, t[0]); + for (i = 0; i <= 29; i++) sike_fpmul_mont(t[i], tt, t[i+1]); + + sike_fpcopy(a, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[5], tt, tt); + for (i = 0; i < 10; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[14], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[3], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[23], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[13], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[7], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[12], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[30], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[1], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[30], tt, tt); + for 
(i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[21], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[2], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[19], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[1], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[26], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[16], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[10], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[6], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[0], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[20], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[9], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[25], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[30], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[26], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(a, tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[28], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[6], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[10], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[22], tt, tt); + for (j = 0; j < 35; j++) { + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[30], tt, tt); + } + sike_fpcopy(tt, a); +} + +// Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. 
+static void fpinv_mont(felm_t a) +{ + felm_t tt = {0}; + sike_fpcopy(a, tt); + fpinv_chain_mont(tt); + fpsqr_mont(tt, tt); + fpsqr_mont(tt, tt); + sike_fpmul_mont(a, tt, a); +} + +// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit. +#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) +inline static unsigned int mp_add(const felm_t a, const felm_t b, felm_t c, const unsigned int nwords) { + uint8_t carry = 0; + for (size_t i = 0; i < nwords; i++) { + ADDC(carry, a[i], b[i], carry, c[i]); + } + return carry; +} + +// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. +inline static unsigned int mp_sub(const felm_t a, const felm_t b, felm_t c, const unsigned int nwords) { + uint32_t borrow = 0; + for (size_t i = 0; i < nwords; i++) { + SUBC(borrow, a[i], b[i], borrow, c[i]); + } + return borrow; +} +#endif + +// Multiprecision addition, c = a+b. +inline static void mp_addfast(const felm_t a, const felm_t b, felm_t c) +{ +#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) + mp_add(a, b, c, NWORDS_FIELD); +#else + sike_mpadd_asm(a, b, c); +#endif +} + +// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. +// If c < 0 then returns mask = 0xFF..F, else mask = 0x00..0 +inline static crypto_word_t mp_subfast(const dfelm_t a, const dfelm_t b, dfelm_t c) { +#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) + return (0 - (crypto_word_t)mp_sub(a, b, c, 2*NWORDS_FIELD)); +#else + return sike_mpsubx2_asm(a, b, c); +#endif +} + +// Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. +// Inputs should be s.t. 
+// c > a and c > b
+inline static void mp_dblsubfast(const dfelm_t a, const dfelm_t b, dfelm_t c) {
+#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64))
+    // Two in-place subtractions; the borrow bits returned by mp_sub are ignored.
+    mp_sub(c, a, c, 2*NWORDS_FIELD);
+    mp_sub(c, b, c, 2*NWORDS_FIELD);
+#else
+    sike_mpdblsubx2_asm(a, b, c);
+#endif
+}
+
+// Copy a field element, c = a.
+void sike_fpcopy(const felm_t a, felm_t c) {
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        c[i] = a[i];
+    }
+}
+
+// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod prime, where R=2^448
+// NOTE(review): the comment used to say R=2^768, a leftover from a larger parameter set;
+// 2^448 assumes R = 2^(64*7) for the 7-word p434 field -- confirm against sike_fprdc.
+void sike_fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc)
+{
+    // Full double-length product first, then Montgomery reduction.
+    dfelm_t temp = {0};
+    sike_mpmul(ma, mb, temp);
+    sike_fprdc(temp, mc);
+}
+
+// Conversion from Montgomery representation to standard representation,
+// c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1].
+void sike_from_mont(const felm_t ma, felm_t c)
+{
+    // Montgomery-multiplying by the plain integer 1 strips one factor of R.
+    felm_t one = {0};
+    one[0] = 1;
+
+    sike_fpmul_mont(ma, one, c);
+    sike_fpcorrection(c);
+}
+
+// GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2).
+// Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1]
+// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1]
+void sike_fp2sqr_mont(const f2elm_t a, f2elm_t c) {
+    felm_t t1, t2, t3;
+
+    mp_addfast(a->c0, a->c1, t1);        // t1 = a0+a1
+    sike_fpsub(a->c0, a->c1, t2);        // t2 = a0-a1
+    mp_addfast(a->c0, a->c0, t3);        // t3 = 2a0
+    sike_fpmul_mont(t1, t2, c->c0);      // c0 = (a0+a1)(a0-a1)
+    sike_fpmul_mont(t3, a->c1, c->c1);   // c1 = 2a0*a1
+}
+
+// Modular negation, a = -a mod p434, computed as 2*p434 - a.
+// Input/output: a in [0, 2*p434-1]
+void sike_fpneg(felm_t a) {
+    uint32_t borrow = 0;
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        SUBC(borrow, params.prime_x2[i], a[i], borrow, a[i]);
+    }
+}
+
+// Modular division by two, c = a/2 mod p434.
+// Input : a in [0, 2*p434-1]
+// Output: c in [0, 2*p434-1]
+void sike_fpdiv2(const felm_t a, felm_t c) {
+    uint32_t carry = 0;
+    crypto_word_t mask;
+
+    mask = 0 - (crypto_word_t)(a[0] & 1);     // If a is odd compute a+p434
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        ADDC(carry, a[i], params.prime[i] & mask, carry, c[i]);
+    }
+
+    // Multiprecision right shift by one.
+    for (size_t i = 0; i < NWORDS_FIELD-1; i++) {
+        c[i] = (c[i] >> 1) ^ (c[i+1] << (RADIX - 1));
+    }
+    c[NWORDS_FIELD-1] >>= 1;
+}
+
+// Modular correction to reduce field element a in [0, 2*p434-1] to [0, p434-1].
+void sike_fpcorrection(felm_t a) {
+    uint32_t borrow = 0;
+    crypto_word_t mask;
+
+    // a <- a - p; a final borrow means a was already below p.
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        SUBC(borrow, a[i], params.prime[i], borrow, a[i]);
+    }
+    mask = 0 - (crypto_word_t)borrow;
+
+    // Add p back under the mask (branch-free) to undo the subtraction when it underflowed.
+    borrow = 0;
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        ADDC(borrow, a[i], params.prime[i] & mask, borrow, a[i]);
+    }
+}
+
+// GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2).
+// Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1]
+// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1]
+void sike_fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c) {
+    felm_t t1, t2;
+    dfelm_t tt1, tt2, tt3;
+    crypto_word_t mask;
+
+    mp_addfast(a->c0, a->c1, t1);                      // t1 = a0+a1
+    mp_addfast(b->c0, b->c1, t2);                      // t2 = b0+b1
+    sike_mpmul(a->c0, b->c0, tt1);                     // tt1 = a0*b0
+    sike_mpmul(a->c1, b->c1, tt2);                     // tt2 = a1*b1
+    sike_mpmul(t1, t2, tt3);                           // tt3 = (a0+a1)*(b0+b1)
+    mp_dblsubfast(tt1, tt2, tt3);                      // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
+    mask = mp_subfast(tt1, tt2, tt1);                  // tt1 = a0*b0 - a1*b1. If tt1 < 0 then mask = 0xFF..F, else if tt1 >= 0 then mask = 0x00..0
+
+    // t1 = p if tt1 went negative, 0 otherwise (branch-free select).
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        t1[i] = params.prime[i] & mask;
+    }
+
+    sike_fprdc(tt3, c->c1);                            // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
+    // Add t1 into the upper NWORDS_FIELD words of tt1 before reducing it.
+    mp_addfast(&tt1[NWORDS_FIELD], t1, &tt1[NWORDS_FIELD]);
+    sike_fprdc(tt1, c->c0);                            // c[0] = a0*b0 - a1*b1
+}
+
+// GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2).
+void sike_fp2inv_mont(f2elm_t a) {
+    f2elm_t t1;
+
+    fpsqr_mont(a->c0, t1->c0);                 // t10 = a0^2
+    fpsqr_mont(a->c1, t1->c1);                 // t11 = a1^2
+    sike_fpadd(t1->c0, t1->c1, t1->c0);        // t10 = a0^2+a1^2
+    fpinv_mont(t1->c0);                        // t10 = (a0^2+a1^2)^-1
+    sike_fpneg(a->c1);                         // a = a0-i*a1
+    sike_fpmul_mont(a->c0, t1->c0, a->c0);
+    sike_fpmul_mont(a->c1, t1->c0, a->c1);     // a = (a0-i*a1)*(a0^2+a1^2)^-1
+}
diff --git a/src/kem/sike/sike-p434-sha256/fpx.h b/src/kem/sike/sike-p434-sha256/fpx.h
new file mode 100644
index 00000000..b9255ac7
--- /dev/null
+++ b/src/kem/sike/sike-p434-sha256/fpx.h
@@ -0,0 +1,112 @@
+#ifndef FPX_H_
+#define FPX_H_
+
+#include "utils.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Modular addition, c = a+b mod p.
+void sike_fpadd(const felm_t a, const felm_t b, felm_t c);
+// Modular subtraction, c = a-b mod p.
+void sike_fpsub(const felm_t a, const felm_t b, felm_t c);
+// Modular division by two, c = a/2 mod p.
+void sike_fpdiv2(const felm_t a, felm_t c);
+// Modular correction to reduce field element a in [0, 2*p-1] to [0, p-1].
+void sike_fpcorrection(felm_t a);
+// Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
+void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c);
+// 434-bit Montgomery reduction, c = a*R^-1 mod p
+void sike_fprdc(const dfelm_t a, felm_t c);
+// Double-length (2x434-bit) multiprecision subtraction, c = c-a-b
+// NOTE(review): declared with felm_t, but mp_dblsubfast() in fpx.c calls it on dfelm_t operands.
+void sike_mpdblsubx2_asm(const felm_t a, const felm_t b, felm_t c);
+// Double-length multiprecision subtraction, c = a-b. mp_subfast() in fpx.c uses the
+// return value as a mask: 0xFF..F if the result is negative, 0x00..0 otherwise.
+crypto_word_t sike_mpsubx2_asm(const dfelm_t a, const dfelm_t b, dfelm_t c);
+// 434-bit multiprecision addition, c = a+b
+void sike_mpadd_asm(const felm_t a, const felm_t b, felm_t c);
+// Modular negation, a = -a mod p.
+void sike_fpneg(felm_t a);
+// Copy of a field element, c = a
+void sike_fpcopy(const felm_t a, felm_t c);
+// NOTE(review): presumably sets a field element to zero, a = 0 (the previous comment was a
+// copy of the sike_fpcopy one); the definition is not part of this header -- confirm.
+void sike_fpzero(felm_t a);
+// If option = 0xFF...FF x=y; y=x, otherwise swap doesn't happen. Constant time.
+void sike_cswap_asm(point_proj_t x, point_proj_t y, const crypto_word_t option);
+// Conversion from Montgomery representation to standard representation,
+// c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1].
+void sike_from_mont(const felm_t ma, felm_t c);
+// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^448
+// NOTE(review): R=2^448 assumes the 7 x 64-bit word representation of p434 -- confirm.
+void sike_fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc);
+// GF(p434^2) multiplication using Montgomery arithmetic, c = a*b in GF(p434^2)
+void sike_fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c);
+// GF(p434^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
+void sike_fp2inv_mont(f2elm_t a);
+// GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2).
+void sike_fp2sqr_mont(const f2elm_t a, f2elm_t c);
+// Modular correction, a = a in GF(p^2).
+// Note: a macro with the same name is defined further down in this header.
+void sike_fp2correction(f2elm_t a);
+
+#if defined(__cplusplus)
+} // extern C
+#endif
+
+// GF(p^2) addition, c = a+b in GF(p^2).
+// Arguments are parenthesized so that any pointer expression can be passed safely.
+#define sike_fp2add(a, b, c) \
+do { \
+    sike_fpadd((a)->c0, (b)->c0, (c)->c0); \
+    sike_fpadd((a)->c1, (b)->c1, (c)->c1); \
+} while(0)
+
+// GF(p^2) subtraction, c = a-b in GF(p^2).
+#define sike_fp2sub(a,b,c) \
+do { \
+    sike_fpsub((a)->c0, (b)->c0, (c)->c0); \
+    sike_fpsub((a)->c1, (b)->c1, (c)->c1); \
+} while(0)
+
+// Copy a GF(p^2) element, c = a.
+#define sike_fp2copy(a, c) \
+do { \
+    sike_fpcopy((a)->c0, (c)->c0); \
+    sike_fpcopy((a)->c1, (c)->c1); \
+} while(0)
+
+// GF(p^2) negation, a = -a in GF(p^2).
+#define sike_fp2neg(a) \
+do { \
+    sike_fpneg((a)->c0); \
+    sike_fpneg((a)->c1); \
+} while(0)
+
+// GF(p^2) division by two, c = a/2 in GF(p^2).
+#define sike_fp2div2(a, c) \
+do { \
+    sike_fpdiv2((a)->c0, (c)->c0); \
+    sike_fpdiv2((a)->c1, (c)->c1); \
+} while(0)
+
+// Modular correction, a = a in GF(p^2): both coordinates are reduced with sike_fpcorrection.
+#define sike_fp2correction(a) \
+do { \
+    sike_fpcorrection((a)->c0); \
+    sike_fpcorrection((a)->c1); \
+} while(0)
+
+// Conversion of a GF(p^2) element to Montgomery representation,
+// mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2).
+// Reads the global params.mont_R2, so a declaration of params must be in scope at the call site.
+#define sike_to_fp2mont(a, mc) \
+do { \
+    sike_fpmul_mont((a)->c0, params.mont_R2, (mc)->c0); \
+    sike_fpmul_mont((a)->c1, params.mont_R2, (mc)->c1); \
+} while(0)
+
+// Conversion of a GF(p^2) element from Montgomery representation to standard representation,
+// c_i = ma_i*R^(-1) = a_i in GF(p^2).
+#define sike_from_fp2mont(ma, c) \
+do { \
+    sike_from_mont((ma)->c0, (c)->c0); \
+    sike_from_mont((ma)->c1, (c)->c1); \
+} while(0)
+
+#endif // FPX_H_
diff --git a/src/kem/sike/sike-p434-sha256/isogeny.c b/src/kem/sike/sike-p434-sha256/isogeny.c
new file mode 100644
index 00000000..661410e4
--- /dev/null
+++ b/src/kem/sike/sike-p434-sha256/isogeny.c
@@ -0,0 +1,262 @@
+/********************************************************************************************
+* SIDH: an efficient supersingular isogeny cryptography library
+*
+* Abstract: elliptic curve and isogeny functions
+*********************************************************************************************/
+// NOTE(review): the operands of the next two #include lines were lost in extraction;
+// <stddef.h> (size_t) and <string.h> (memmove) cover what this file uses -- confirm.
+#include <stddef.h>
+#include <string.h>
+#include "utils.h"
+#include "isogeny.h"
+#include "fpx.h"
+
+static void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z).
+  // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C.
+  // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2).
+    f2elm_t t0, t1;
+
+    sike_fp2sub(P->X, P->Z, t0);                         // t0 = X1-Z1
+    sike_fp2add(P->X, P->Z, t1);                         // t1 = X1+Z1
+    sike_fp2sqr_mont(t0, t0);                            // t0 = (X1-Z1)^2
+    sike_fp2sqr_mont(t1, t1);                            // t1 = (X1+Z1)^2
+    sike_fp2mul_mont(C24, t0, Q->Z);                     // Z2 = C24*(X1-Z1)^2
+    sike_fp2mul_mont(t1, Q->Z, Q->X);                    // X2 = C24*(X1-Z1)^2*(X1+Z1)^2
+    sike_fp2sub(t1, t0, t1);                             // t1 = (X1+Z1)^2-(X1-Z1)^2
+    sike_fp2mul_mont(A24plus, t1, t0);                   // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2]
+    sike_fp2add(Q->Z, t0, Q->Z);                         // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2
+    sike_fp2mul_mont(Q->Z, t1, Q->Z);                    // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2]
+}
+
+void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, size_t e)
+{ // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings.
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C.
+  // Output: projective Montgomery x-coordinates Q <- (2^e)*P.
+
+    // memmove rather than memcpy: P and Q may be the same point.
+    memmove(Q, P, sizeof(*P));
+    for (size_t i = 0; i < e; i++) {
+        xDBL(Q, Q, A24plus, C24);
+    }
+}
+
+void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff)
+{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4.
+  // Input: projective point of order four P = (X4:Z4).
+  // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients
+  //         that are used to evaluate the isogeny at a point in eval_4_isog().
+
+    sike_fp2sub(P->X, P->Z, coeff[1]);                   // coeff[1] = X4-Z4
+    sike_fp2add(P->X, P->Z, coeff[2]);                   // coeff[2] = X4+Z4
+    sike_fp2sqr_mont(P->Z, coeff[0]);                    // coeff[0] = Z4^2
+    sike_fp2add(coeff[0], coeff[0], coeff[0]);           // coeff[0] = 2*Z4^2
+    sike_fp2sqr_mont(coeff[0], C24);                     // C24 = 4*Z4^4
+    sike_fp2add(coeff[0], coeff[0], coeff[0]);           // coeff[0] = 4*Z4^2
+    sike_fp2sqr_mont(P->X, A24plus);                     // A24plus = X4^2
+    sike_fp2add(A24plus, A24plus, A24plus);              // A24plus = 2*X4^2
+    sike_fp2sqr_mont(A24plus, A24plus);                  // A24plus = 4*X4^4
+}
+
+void eval_4_isog(point_proj_t P, f2elm_t* coeff)
+{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined
+  // by the 3 coefficients in coeff (computed in the function get_4_isog()).
+  // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z).
+  // Output: the projective point P = phi(P) = (X:Z) in the codomain.
+    f2elm_t t0, t1;
+
+    sike_fp2add(P->X, P->Z, t0);                         // t0 = X+Z
+    sike_fp2sub(P->X, P->Z, t1);                         // t1 = X-Z
+    sike_fp2mul_mont(t0, coeff[1], P->X);                // X = (X+Z)*coeff[1]
+    sike_fp2mul_mont(t1, coeff[2], P->Z);                // Z = (X-Z)*coeff[2]
+    sike_fp2mul_mont(t0, t1, t0);                        // t0 = (X+Z)*(X-Z)
+    sike_fp2mul_mont(t0, coeff[0], t0);                  // t0 = coeff[0]*(X+Z)*(X-Z)
+    sike_fp2add(P->X, P->Z, t1);                         // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1]
+    sike_fp2sub(P->X, P->Z, P->Z);                       // Z = (X+Z)*coeff[1] - (X-Z)*coeff[2]
+    sike_fp2sqr_mont(t1, t1);                            // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2
+    sike_fp2sqr_mont(P->Z, P->Z);                        // Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2
+    sike_fp2add(t1, t0, P->X);                           // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2
+    sike_fp2sub(P->Z, t0, t0);                           // t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z)
+    sike_fp2mul_mont(P->X, t1, P->X);                    // Xfinal
+    sike_fp2mul_mont(P->Z, t0, P->Z);                    // Zfinal
+}
+
+
+void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus)
+{ // Tripling of a Montgomery point in projective coordinates (X:Z).
+  // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
+  // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3).
+    f2elm_t t0, t1, t2, t3, t4, t5, t6;
+
+    sike_fp2sub(P->X, P->Z, t0);                         // t0 = X-Z
+    sike_fp2sqr_mont(t0, t2);                            // t2 = (X-Z)^2
+    sike_fp2add(P->X, P->Z, t1);                         // t1 = X+Z
+    sike_fp2sqr_mont(t1, t3);                            // t3 = (X+Z)^2
+    sike_fp2add(t0, t1, t4);                             // t4 = 2*X
+    sike_fp2sub(t1, t0, t0);                             // t0 = 2*Z
+    sike_fp2sqr_mont(t4, t1);                            // t1 = 4*X^2
+    sike_fp2sub(t1, t3, t1);                             // t1 = 4*X^2 - (X+Z)^2
+    sike_fp2sub(t1, t2, t1);                             // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2
+    sike_fp2mul_mont(t3, A24plus, t5);                   // t5 = A24plus*(X+Z)^2
+    sike_fp2mul_mont(t3, t5, t3);                        // t3 = A24plus*(X+Z)^4
+    sike_fp2mul_mont(A24minus, t2, t6);                  // t6 = A24minus*(X-Z)^2
+    sike_fp2mul_mont(t2, t6, t2);                        // t2 = A24minus*(X-Z)^4
+    sike_fp2sub(t2, t3, t3);                             // t3 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4
+    sike_fp2sub(t5, t6, t2);                             // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2
+    sike_fp2mul_mont(t1, t2, t1);                        // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2]
+    sike_fp2add(t3, t1, t2);                             // t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^4 - A24plus*(X+Z)^4
+    sike_fp2sqr_mont(t2, t2);                            // t2 = t2^2
+    sike_fp2mul_mont(t4, t2, Q->X);                      // X3 = 2*X*t2
+    sike_fp2sub(t3, t1, t1);                             // t1 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2]
+    sike_fp2sqr_mont(t1, t1);                            // t1 = t1^2
+    sike_fp2mul_mont(t0, t1, Q->Z);                      // Z3 = 2*Z*t1
+}
+
+void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, size_t e)
+{ // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings.
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
+  // Output: projective Montgomery x-coordinates Q <- (3^e)*P.
+    // memmove rather than memcpy: P and Q may be the same point.
+    memmove(Q, P, sizeof(*P));
+    for (size_t i = 0; i < e; i++) {
+        xTPL(Q, Q, A24minus, A24plus);
+    }
+}
+
+void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t* coeff)
+{ // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3.
+  // Input: projective point of order three P = (X3:Z3).
+  // Output: the 3-isogenous Montgomery curve with projective coefficient A/C, returned as A24minus and A24plus,
+  //         and the 2 coefficients coeff[0], coeff[1] that are used in eval_3_isog().
+    f2elm_t t0, t1, t2, t3, t4;
+
+    sike_fp2sub(P->X, P->Z, coeff[0]);                   // coeff0 = X-Z
+    sike_fp2sqr_mont(coeff[0], t0);                      // t0 = (X-Z)^2
+    sike_fp2add(P->X, P->Z, coeff[1]);                   // coeff1 = X+Z
+    sike_fp2sqr_mont(coeff[1], t1);                      // t1 = (X+Z)^2
+    sike_fp2add(t0, t1, t2);                             // t2 = (X+Z)^2 + (X-Z)^2
+    sike_fp2add(coeff[0], coeff[1], t3);                 // t3 = 2*X
+    sike_fp2sqr_mont(t3, t3);                            // t3 = 4*X^2
+    sike_fp2sub(t3, t2, t3);                             // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2
+    sike_fp2add(t1, t3, t2);                             // t2 = 4*X^2 - (X-Z)^2
+    sike_fp2add(t3, t0, t3);                             // t3 = 4*X^2 - (X+Z)^2
+    sike_fp2add(t0, t3, t4);                             // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2
+    sike_fp2add(t4, t4, t4);                             // t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2)
+    sike_fp2add(t1, t4, t4);                             // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2
+    sike_fp2mul_mont(t2, t4, A24minus);                  // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2]
+    sike_fp2add(t1, t2, t4);                             // t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2
+    sike_fp2add(t4, t4, t4);                             // t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2)
+    sike_fp2add(t0, t4, t4);                             // t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2
+    sike_fp2mul_mont(t3, t4, t4);                        // t4 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2]
+    sike_fp2sub(t4, A24minus, t0);                       // t0 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] - [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2]
+    sike_fp2add(A24minus, t0, A24plus);                  // A24plus = A24minus + t0 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2]
+}
+
+
+void eval_3_isog(point_proj_t Q, f2elm_t* coeff)
+{ // Evaluates the 3-isogeny phi at the point Q = (X:Z), given the 2 coefficients in coeff
+  // (computed in the function get_3_isog()).
+  // Inputs: the 2 coefficients in coeff and the projective point Q = (X:Z).
+  // Output: the projective point Q <- phi(Q), overwritten in place.
+    f2elm_t t0, t1, t2;
+
+    sike_fp2add(Q->X, Q->Z, t0);                         // t0 = X+Z
+    sike_fp2sub(Q->X, Q->Z, t1);                         // t1 = X-Z
+    sike_fp2mul_mont(t0, coeff[0], t0);                  // t0 = coeff0*(X+Z)
+    sike_fp2mul_mont(t1, coeff[1], t1);                  // t1 = coeff1*(X-Z)
+    sike_fp2add(t0, t1, t2);                             // t2 = coeff0*(X+Z) + coeff1*(X-Z)
+    sike_fp2sub(t1, t0, t0);                             // t0 = coeff1*(X-Z) - coeff0*(X+Z)
+    sike_fp2sqr_mont(t2, t2);                            // t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2
+    sike_fp2sqr_mont(t0, t0);                            // t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2
+    sike_fp2mul_mont(Q->X, t2, Q->X);                    // X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2
+    sike_fp2mul_mont(Q->Z, t0, Q->Z);                    // Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2
+}
+
+
+void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3)
+{ // 3-way simultaneous inversion
+  // Input: z1,z2,z3
+  // Output: 1/z1,1/z2,1/z3 (override inputs).
+  // Only one field inversion is performed; the rest is done with multiplications.
+    f2elm_t t0, t1, t2, t3;
+
+    sike_fp2mul_mont(z1, z2, t0);                        // t0 = z1*z2
+    sike_fp2mul_mont(z3, t0, t1);                        // t1 = z1*z2*z3
+    sike_fp2inv_mont(t1);                                // t1 = 1/(z1*z2*z3)
+    sike_fp2mul_mont(z3, t1, t2);                        // t2 = 1/(z1*z2)
+    sike_fp2mul_mont(t2, z2, t3);                        // t3 = 1/z1
+    sike_fp2mul_mont(t2, z1, z2);                        // z2 = 1/z2
+    sike_fp2mul_mont(t0, t1, z3);                        // z3 = 1/z3
+    sike_fp2copy(t3, z1);                                // z1 = 1/z1
+}
+
+
+void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A)
+{ // Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A.
+  // Input: the x-coordinates xP, xQ, and xR of the points P, Q and R.
+  // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x.
+    f2elm_t t0, t1, one = F2ELM_INIT;
+
+    extern const struct params_t params;
+    sike_fpcopy(params.mont_one, one->c0);               // one = 1 in Montgomery form (c1 stays zero)
+    sike_fp2add(xP, xQ, t1);                             // t1 = xP+xQ
+    sike_fp2mul_mont(xP, xQ, t0);                        // t0 = xP*xQ
+    sike_fp2mul_mont(xR, t1, A);                         // A = xR*t1
+    sike_fp2add(t0, A, A);                               // A = A+t0
+    sike_fp2mul_mont(t0, xR, t0);                        // t0 = t0*xR
+    sike_fp2sub(A, one, A);                              // A = A-1
+    sike_fp2add(t0, t0, t0);                             // t0 = t0+t0
+    sike_fp2add(t1, xR, t1);                             // t1 = t1+xR
+    sike_fp2add(t0, t0, t0);                             // t0 = t0+t0
+    sike_fp2sqr_mont(A, A);                              // A = A^2
+    sike_fp2inv_mont(t0);                                // t0 = 1/t0
+    sike_fp2mul_mont(A, t0, A);                          // A = A*t0
+    sike_fp2sub(A, t1, A);                               // Afinal = A-t1
+}
+
+
+void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv)
+{ // Computes the j-invariant of a Montgomery curve with projective constant.
+  // Input: A,C in GF(p^2).
+  // Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x.
+    f2elm_t t0, t1;
+
+    sike_fp2sqr_mont(A, jinv);                           // jinv = A^2
+    sike_fp2sqr_mont(C, t1);                             // t1 = C^2
+    sike_fp2add(t1, t1, t0);                             // t0 = t1+t1
+    sike_fp2sub(jinv, t0, t0);                           // t0 = jinv-t0
+    sike_fp2sub(t0, t1, t0);                             // t0 = t0-t1
+    sike_fp2sub(t0, t1, jinv);                           // jinv = t0-t1
+    sike_fp2sqr_mont(t1, t1);                            // t1 = t1^2
+    sike_fp2mul_mont(jinv, t1, jinv);                    // jinv = jinv*t1
+    sike_fp2add(t0, t0, t0);                             // t0 = t0+t0
+    sike_fp2add(t0, t0, t0);                             // t0 = t0+t0
+    sike_fp2sqr_mont(t0, t1);                            // t1 = t0^2
+    sike_fp2mul_mont(t0, t1, t0);                        // t0 = t0*t1
+    sike_fp2add(t0, t0, t0);                             // t0 = t0+t0
+    sike_fp2add(t0, t0, t0);                             // t0 = t0+t0
+    sike_fp2inv_mont(jinv);                              // jinv = 1/jinv
+    sike_fp2mul_mont(jinv, t0, jinv);                    // jinv = t0*jinv
+}
+
+
+void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24)
+{ // Simultaneous doubling and differential addition.
+  // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4.
+  // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP.
+    f2elm_t t0, t1, t2;
+
+    sike_fp2add(P->X, P->Z, t0);                         // t0 = XP+ZP
+    sike_fp2sub(P->X, P->Z, t1);                         // t1 = XP-ZP
+    sike_fp2sqr_mont(t0, P->X);                          // XP = (XP+ZP)^2
+    sike_fp2sub(Q->X, Q->Z, t2);                         // t2 = XQ-ZQ
+    sike_fp2correction(t2);                              // reduce both coordinates of t2 with sike_fpcorrection
+    sike_fp2add(Q->X, Q->Z, Q->X);                       // XQ = XQ+ZQ
+    sike_fp2mul_mont(t0, t2, t0);                        // t0 = (XP+ZP)*(XQ-ZQ)
+    sike_fp2sqr_mont(t1, P->Z);                          // ZP = (XP-ZP)^2
+    sike_fp2mul_mont(t1, Q->X, t1);                      // t1 = (XP-ZP)*(XQ+ZQ)
+    sike_fp2sub(P->X, P->Z, t2);                         // t2 = (XP+ZP)^2-(XP-ZP)^2
+    sike_fp2mul_mont(P->X, P->Z, P->X);                  // XP = (XP+ZP)^2*(XP-ZP)^2
+    sike_fp2mul_mont(t2, A24, Q->X);                     // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2]
+    sike_fp2sub(t0, t1, Q->Z);                           // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)
+    sike_fp2add(Q->X, P->Z, P->Z);                       // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2
+    sike_fp2add(t0, t1, Q->X);                           // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)
+    sike_fp2mul_mont(P->Z, t2, P->Z);                    // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2]
+    sike_fp2sqr_mont(Q->Z, Q->Z);                        // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
+    sike_fp2sqr_mont(Q->X, Q->X);                        // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2
+    sike_fp2mul_mont(Q->Z, xPQ, Q->Z);                   // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
+}
diff --git a/src/kem/sike/sike-p434-sha256/isogeny.h b/src/kem/sike/sike-p434-sha256/isogeny.h
new file mode 100644
index 00000000..460c8c66
--- /dev/null
+++ b/src/kem/sike/sike-p434-sha256/isogeny.h
@@ -0,0 +1,49 @@
+#ifndef ISOGENY_H_
+#define ISOGENY_H_
+
+// Make the header self-contained: size_t, plus the point_proj_t / f2elm_t types
+// that fpx.h also obtains from utils.h.
+#include <stddef.h>
+#include "utils.h"
+
+// Computes [2^e](X:Z) on Montgomery curve with projective
+// constant via e repeated doublings.
+void xDBLe(
+    const point_proj_t P, point_proj_t Q, const f2elm_t A24plus,
+    const f2elm_t C24, size_t e);
+// Simultaneous doubling and differential addition:
+// P <- 2*P and Q <- P+Q, where xPQ = x(P-Q) and A24 = (A+2)/4.
+void xDBLADD(
+    point_proj_t P, point_proj_t Q, const f2elm_t xPQ,
+    const f2elm_t A24);
+// Tripling of a Montgomery point in projective coordinates (X:Z), Q = 3*P.
+void xTPL(
+    const point_proj_t P, point_proj_t Q, const f2elm_t A24minus,
+    const f2elm_t A24plus);
+// Computes [3^e](X:Z) on Montgomery curve with projective constant
+// via e repeated triplings.
+void xTPLe(
+    const point_proj_t P, point_proj_t Q, const f2elm_t A24minus,
+    const f2elm_t A24plus, size_t e);
+// Given the x-coordinates of P, Q, and R, returns the value A
+// corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A.
+void get_A(
+    const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A);
+// Computes the j-invariant of a Montgomery curve with projective constant.
+void j_inv(
+    const f2elm_t A, const f2elm_t C, f2elm_t jinv);
+// Computes the corresponding 4-isogeny of a projective Montgomery
+// point (X4:Z4) of order 4. Writes 3 coefficients to coeff for eval_4_isog().
+void get_4_isog(
+    const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff);
+// Computes the corresponding 3-isogeny of a projective Montgomery
+// point (X3:Z3) of order 3. Writes 2 coefficients to coeff for eval_3_isog().
+void get_3_isog(
+    const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus,
+    f2elm_t* coeff);
+// Evaluates the 3-isogeny at the point Q = (X:Z), in place, using the
+// coefficients in coeff computed by get_3_isog().
+void eval_3_isog(
+    point_proj_t Q, f2elm_t* coeff);
+// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny.
+void eval_4_isog(
+    point_proj_t P, f2elm_t* coeff);
+// 3-way simultaneous inversion
+void inv_3_way(
+    f2elm_t z1, f2elm_t z2, f2elm_t z3);
+
+#endif // ISOGENY_H_
diff --git a/src/kem/sike/sike-p434-sha256/params.c b/src/kem/sike/sike-p434-sha256/params.c
new file mode 100644
index 00000000..b13f4c87
--- /dev/null
+++ b/src/kem/sike/sike-p434-sha256/params.c
@@ -0,0 +1,128 @@
+/********************************************************************************************
+* SIDH: an efficient supersingular isogeny cryptography library
+*
+* Abstract: supersingular isogeny parameters and generation of functions for P434
+*********************************************************************************************/
+
+#include "utils.h"
+
+// Parameters for isogeny system "SIKE"
+// Every multi-word constant is listed least-significant 64-bit value first.
+const struct params_t params = {
+    // The prime p434 (its low 216 bits are all ones).
+    .prime = {
+        U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF),
+        U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFDC1767AE2FFFFFF),
+        U64_TO_WORDS(0x7BC65C783158AEA3), U64_TO_WORDS(0x6CFC5FD681C52056),
+        U64_TO_WORDS(0x0002341F27177344)
+    },
+    // p434 + 1
+    .prime_p1 = {
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xFDC1767AE3000000),
+        U64_TO_WORDS(0x7BC65C783158AEA3), U64_TO_WORDS(0x6CFC5FD681C52056),
+        U64_TO_WORDS(0x0002341F27177344)
+    },
+    // 2 * p434 (used by sike_fpneg)
+    .prime_x2 = {
+        U64_TO_WORDS(0xFFFFFFFFFFFFFFFE), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF),
+        U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFB82ECF5C5FFFFFF),
+        U64_TO_WORDS(0xF78CB8F062B15D47), U64_TO_WORDS(0xD9F8BFAD038A40AC),
+        U64_TO_WORDS(0x0004683E4E2EE688)
+    },
+    // Alice's basis: x-coordinates of P, Q and R, each as a (c0, c1) pair of field
+    // elements, in the order read by sike_init_basis() in sike.c.
+    .A_gen = {
+        U64_TO_WORDS(0x05ADF455C5C345BF), U64_TO_WORDS(0x91935C5CC767AC2B),
+        U64_TO_WORDS(0xAFE4E879951F0257), U64_TO_WORDS(0x70E792DC89FA27B1),
+        U64_TO_WORDS(0xF797F526BB48C8CD), U64_TO_WORDS(0x2181DB6131AF621F),
+        U64_TO_WORDS(0x00000A1C08B1ECC4), // XPA0
+        U64_TO_WORDS(0x74840EB87CDA7788), U64_TO_WORDS(0x2971AA0ECF9F9D0B),
+        U64_TO_WORDS(0xCB5732BDF41715D5), U64_TO_WORDS(0x8CD8E51F7AACFFAA),
+        U64_TO_WORDS(0xA7F424730D7E419F), U64_TO_WORDS(0xD671EB919A179E8C),
+        U64_TO_WORDS(0x0000FFA26C5A924A), // XPA1
+        U64_TO_WORDS(0xFEC6E64588B7273B), U64_TO_WORDS(0xD2A626D74CBBF1C6),
+        U64_TO_WORDS(0xF8F58F07A78098C7), U64_TO_WORDS(0xE23941F470841B03),
+        U64_TO_WORDS(0x1B63EDA2045538DD), U64_TO_WORDS(0x735CFEB0FFD49215),
+        U64_TO_WORDS(0x0001C4CB77542876), // XQA0
+        U64_TO_WORDS(0xADB0F733C17FFDD6), U64_TO_WORDS(0x6AFFBD037DA0A050),
+        U64_TO_WORDS(0x680EC43DB144E02F), U64_TO_WORDS(0x1E2E5D5FF524E374),
+        U64_TO_WORDS(0xE2DDA115260E2995), U64_TO_WORDS(0xA6E4B552E2EDE508),
+        U64_TO_WORDS(0x00018ECCDDF4B53E), // XQA1
+        U64_TO_WORDS(0x01BA4DB518CD6C7D), U64_TO_WORDS(0x2CB0251FE3CC0611),
+        U64_TO_WORDS(0x259B0C6949A9121B), U64_TO_WORDS(0x60E17AC16D2F82AD),
+        U64_TO_WORDS(0x3AA41F1CE175D92D), U64_TO_WORDS(0x413FBE6A9B9BC4F3),
+        U64_TO_WORDS(0x00022A81D8D55643), // XRA0
+        U64_TO_WORDS(0xB8ADBC70FC82E54A), U64_TO_WORDS(0xEF9CDDB0D5FADDED),
+        U64_TO_WORDS(0x5820C734C80096A0), U64_TO_WORDS(0x7799994BAA96E0E4),
+        U64_TO_WORDS(0x044961599E379AF8), U64_TO_WORDS(0xDB2B94FBF09F27E2),
+        U64_TO_WORDS(0x0000B87FC716C0C6)  // XRA1
+    },
+    // Bob's basis, same layout as A_gen.
+    .B_gen = {
+        U64_TO_WORDS(0x6E5497556EDD48A3), U64_TO_WORDS(0x2A61B501546F1C05),
+        U64_TO_WORDS(0xEB919446D049887D), U64_TO_WORDS(0x5864A4A69D450C4F),
+        U64_TO_WORDS(0xB883F276A6490D2B), U64_TO_WORDS(0x22CC287022D5F5B9),
+        U64_TO_WORDS(0x0001BED4772E551F), // XPB0
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), // XPB1
+        U64_TO_WORDS(0xFAE2A3F93D8B6B8E), U64_TO_WORDS(0x494871F51700FE1C),
+        U64_TO_WORDS(0xEF1A94228413C27C), U64_TO_WORDS(0x498FF4A4AF60BD62),
+        U64_TO_WORDS(0xB00AD2A708267E8A), U64_TO_WORDS(0xF4328294E017837F),
+        U64_TO_WORDS(0x000034080181D8AE), // XQB0
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), // XQB1
+        U64_TO_WORDS(0x283B34FAFEFDC8E4), U64_TO_WORDS(0x9208F44977C3E647),
+        U64_TO_WORDS(0x7DEAE962816F4E9A), U64_TO_WORDS(0x68A2BA8AA262EC9D),
+        U64_TO_WORDS(0x8176F112EA43F45B), U64_TO_WORDS(0x02106D022634F504),
+        U64_TO_WORDS(0x00007E8A50F02E37), // XRB0
+        U64_TO_WORDS(0xB378B7C1DA22CCB1), U64_TO_WORDS(0x6D089C99AD1D9230),
+        U64_TO_WORDS(0xEBE15711813E2369), U64_TO_WORDS(0x2B35A68239D48A53),
+        U64_TO_WORDS(0x445F6FD138407C93), U64_TO_WORDS(0xBEF93B29A3F6B54B),
+        U64_TO_WORDS(0x000173FA910377D3)  // XRB1
+    },
+    // R^2 mod p, the multiplier used by sike_to_fp2mont() to enter Montgomery form.
+    .mont_R2 = {
+        U64_TO_WORDS(0x28E55B65DCD69B30), U64_TO_WORDS(0xACEC7367768798C2),
+        U64_TO_WORDS(0xAB27973F8311688D), U64_TO_WORDS(0x175CC6AF8D6C7C0B),
+        U64_TO_WORDS(0xABCD92BF2DDE347E), U64_TO_WORDS(0x69E16A61C7686D9A),
+        U64_TO_WORDS(0x000025A89BCDD12A)
+    },
+    // The value 1 in Montgomery representation.
+    .mont_one = {
+        U64_TO_WORDS(0x000000000000742C), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xB90FF404FC000000),
+        U64_TO_WORDS(0xD801A4FB559FACD4), U64_TO_WORDS(0xE93254545F77410C),
+        U64_TO_WORDS(0x0000ECEEA7BD2EDA)
+    },
+    // The value 6 in Montgomery representation (6 * mont_one mod p).
+    .mont_six = {
+        U64_TO_WORDS(0x000000000002B90A), U64_TO_WORDS(0x0000000000000000),
+        U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x5ADCCB2822000000),
+        U64_TO_WORDS(0x187D24F39F0CAFB4), U64_TO_WORDS(0x9D353A4D394145A0),
+        U64_TO_WORDS(0x00012559A0403298)
+    },
+    // NOTE(review): presumably the tree-traversal strategy for Alice's (2-power) isogeny
+    // computation; it is consumed outside the code visible here -- confirm.
+    .A_strat = {
+        0x30, 0x1C, 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+        0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04,
+        0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01,
+        0x02, 0x01, 0x01, 0x0D, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02,
+        0x01, 0x01, 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x04,
+        0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01,
+        0x15, 0x0C, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
+        0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x03, 0x02, 0x01,
+        0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x09, 0x05, 0x03,
+        0x02, 0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x04,
+        0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01
+    },
+    // NOTE(review): presumably the corresponding strategy for Bob's (3-power) isogeny
+    // computation -- confirm.
+    .B_strat = {
+        0x42, 0x21, 0x11, 0x09, 0x05, 0x03, 0x02, 0x01, 0x01, 0x01,
+        0x01, 0x02, 0x01, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x01,
+        0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02,
+        0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x10,
+        0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04,
+        0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01,
+        0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+        0x01, 0x20, 0x10, 0x08, 0x04, 0x03, 0x01, 0x01, 0x01, 0x01,
+        0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01,
+        0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02,
+        0x01, 0x01, 0x02, 0x01, 0x01, 0x10, 0x08, 0x04, 0x02, 0x01,
+        0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01,
+        0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04,
+        0x02, 0x01, 0x01, 0x02, 0x01, 0x01
+    }
+};
diff --git a/src/kem/sike/sike-p434-sha256/sike.c b/src/kem/sike/sike-p434-sha256/sike.c
new file mode 100644
index 00000000..f00ebe76
--- /dev/null
+++ b/src/kem/sike/sike-p434-sha256/sike.c
@@ -0,0 +1,517 @@
+/********************************************************************************************
+* SIDH: an efficient supersingular isogeny cryptography library
+*
+* Abstract: supersingular isogeny key encapsulation (SIKE) protocol
+*********************************************************************************************/
+
+// NOTE(review): the header names of the six #include lines below were lost in extraction
+// and must be restored from the original commit.
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "utils.h"
+#include "isogeny.h"
+#include "fpx.h"
+
+extern const struct params_t params;
+
+// SIDH_JINV_BYTESZ is a number of bytes used for encoding j-invariant.
+#define SIDH_JINV_BYTESZ 110U
+// SIDH_PRV_A_BITSZ is a number of bits of SIDH private key (2-isogeny)
+#define SIDH_PRV_A_BITSZ 216U
+// SIDH_PRV_B_BITSZ is a number of bits of SIDH private key (3-isogeny)
+#define SIDH_PRV_B_BITSZ 217U
+// MAX_INT_POINTS_ALICE is a number of points used in 2-isogeny tree computation
+#define MAX_INT_POINTS_ALICE 7U
+// MAX_INT_POINTS_BOB is a number of points used in 3-isogeny tree computation
+#define MAX_INT_POINTS_BOB 8U
+
+// Swap points.
+// If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P
+// Portable version, compiled only when the x86-64 assembly implementation is not used.
+#if !defined(ARCH_X86_64) || defined(ARCH_GENERIC)
+static void sike_cswap(point_proj_t P, point_proj_t Q, const crypto_word_t option)
+{
+    crypto_word_t temp;
+    // Branch-free masked XOR swap, one word of X.c0, Z.c0, X.c1 and Z.c1 per iteration:
+    // temp is P^Q when option is all ones and 0 when option is 0.
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        temp = option & (P->X->c0[i] ^ Q->X->c0[i]);
+        P->X->c0[i] = temp ^ P->X->c0[i];
+        Q->X->c0[i] = temp ^ Q->X->c0[i];
+        temp = option & (P->Z->c0[i] ^ Q->Z->c0[i]);
+        P->Z->c0[i] = temp ^ P->Z->c0[i];
+        Q->Z->c0[i] = temp ^ Q->Z->c0[i];
+        temp = option & (P->X->c1[i] ^ Q->X->c1[i]);
+        P->X->c1[i] = temp ^ P->X->c1[i];
+        Q->X->c1[i] = temp ^ Q->X->c1[i];
+        temp = option & (P->Z->c1[i] ^ Q->Z->c1[i]);
+        P->Z->c1[i] = temp ^ P->Z->c1[i];
+        Q->Z->c1[i] = temp ^ Q->Z->c1[i];
+    }
+}
+#endif
+
+// Swap points.
+// If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P +static inline void sike_fp2cswap(point_proj_t P, point_proj_t Q, const crypto_word_t option) +{ +#if defined(ARCH_X86_64) && !defined(ARCH_GENERIC) + sike_cswap_asm(P, Q, option); +#else + sike_cswap(P, Q, option); +#endif +} + +static void ladder3Pt( + const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const uint8_t* m, + int is_A, point_proj_t R, const f2elm_t A) { + point_proj_t R0 = POINT_PROJ_INIT, R2 = POINT_PROJ_INIT; + f2elm_t A24 = F2ELM_INIT; + crypto_word_t mask; + int bit, swap, prevbit = 0; + + const size_t nbits = is_A?SIDH_PRV_A_BITSZ:SIDH_PRV_B_BITSZ; + + // Initializing constant + sike_fpcopy(params.mont_one, A24[0].c0); + sike_fp2add(A24, A24, A24); + sike_fp2add(A, A24, A24); + sike_fp2div2(A24, A24); + sike_fp2div2(A24, A24); // A24 = (A+2)/4 + + // Initializing points + sike_fp2copy(xQ, R0->X); + sike_fpcopy(params.mont_one, R0->Z[0].c0); + sike_fp2copy(xPQ, R2->X); + sike_fpcopy(params.mont_one, R2->Z[0].c0); + sike_fp2copy(xP, R->X); + sike_fpcopy(params.mont_one, R->Z[0].c0); + memset(R->Z->c1, 0, sizeof(R->Z->c1)); + + // Main loop + for (size_t i = 0; i < nbits; i++) { + bit = (m[i >> 3] >> (i & 7)) & 1; + swap = bit ^ prevbit; + prevbit = bit; + mask = 0 - (crypto_word_t)swap; + + sike_fp2cswap(R, R2, mask); + xDBLADD(R0, R2, R->X, A24); + sike_fp2mul_mont(R2->X, R->Z, R2->X); + } + swap = 0 ^ prevbit; + mask = 0 - (crypto_word_t)swap; + sike_fp2cswap(R, R2, mask); +} + +// Initialization of basis points +static inline void sike_init_basis(const crypto_word_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR) { + sike_fpcopy(gen, XP->c0); + sike_fpcopy(gen + NWORDS_FIELD, XP->c1); + sike_fpcopy(gen + 2*NWORDS_FIELD, XQ->c0); + sike_fpcopy(gen + 3*NWORDS_FIELD, XQ->c1); + sike_fpcopy(gen + 4*NWORDS_FIELD, XR->c0); + sike_fpcopy(gen + 5*NWORDS_FIELD, XR->c1); +} + +// Conversion of GF(p^2) element from Montgomery to standard representation. 
+static inline void sike_fp2_encode(const f2elm_t x, uint8_t *enc) { + f2elm_t t; + sike_from_fp2mont(x, t); + + // convert to bytes in little endian form + for (size_t i=0; i> (8*(i%LSZ))) & 0xFF; + enc[i+FIELD_BYTESZ] = (t[0].c1[i/LSZ] >> (8*(i%LSZ))) & 0xFF; + } +} + +// Parse byte sequence back into GF(p^2) element, and conversion to Montgomery representation. +// Elements over GF(p503) are encoded in 63 octets in little endian format +// (i.e., the least significant octet is located in the lowest memory address). +static inline void fp2_decode(const uint8_t *enc, f2elm_t t) { + memset(t[0].c0, 0, sizeof(t[0].c0)); + memset(t[0].c1, 0, sizeof(t[0].c1)); + // convert bytes in little endian form to f2elm_t + for (size_t i = 0; i < FIELD_BYTESZ; i++) { + t[0].c0[i/LSZ] |= ((crypto_word_t)enc[i+ 0]) << (8*(i%LSZ)); + t[0].c1[i/LSZ] |= ((crypto_word_t)enc[i+FIELD_BYTESZ]) << (8*(i%LSZ)); + } + sike_to_fp2mont(t, t); +} + +// Alice's ephemeral public key generation +// Input: a private key prA in the range [0, 2^250 - 1], stored in 32 bytes. +// Output: the public key pkA consisting of 3 GF(p503^2) elements encoded in 378 bytes. 
+static void gen_iso_A(const uint8_t* skA, uint8_t* pkA) +{ + point_proj_t R, pts[MAX_INT_POINTS_ALICE]; + point_proj_t phiP = POINT_PROJ_INIT; + point_proj_t phiQ = POINT_PROJ_INIT; + point_proj_t phiR = POINT_PROJ_INIT; + f2elm_t XPA, XQA, XRA, coeff[3]; + f2elm_t A24plus = F2ELM_INIT; + f2elm_t C24 = F2ELM_INIT; + f2elm_t A = F2ELM_INIT; + unsigned int m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; + + // Initialize basis points + sike_init_basis(params.A_gen, XPA, XQA, XRA); + sike_init_basis(params.B_gen, phiP->X, phiQ->X, phiR->X); + sike_fpcopy(params.mont_one, (phiP->Z)->c0); + sike_fpcopy(params.mont_one, (phiQ->Z)->c0); + sike_fpcopy(params.mont_one, (phiR->Z)->c0); + + // Initialize constants: A24plus = A+2C, C24 = 4C, where A=6, C=1 + sike_fpcopy(params.mont_one, A24plus->c0); + sike_fp2add(A24plus, A24plus, A24plus); + sike_fp2add(A24plus, A24plus, C24); + sike_fp2add(A24plus, C24, A); + sike_fp2add(C24, C24, A24plus); + + // Retrieve kernel point + ladder3Pt(XPA, XQA, XRA, skA, 1, R, A); + + // Traverse tree + index = 0; + for (size_t row = 1; row < A_max; row++) { + while (index < A_max-row) { + sike_fp2copy(R->X, pts[npts]->X); + sike_fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = params.A_strat[ii++]; + xDBLe(R, R, A24plus, C24, (2*m)); + index += m; + } + get_4_isog(R, A24plus, C24, coeff); + + for (size_t i = 0; i < npts; i++) { + eval_4_isog(pts[i], coeff); + } + eval_4_isog(phiP, coeff); + eval_4_isog(phiQ, coeff); + eval_4_isog(phiR, coeff); + + sike_fp2copy(pts[npts-1]->X, R->X); + sike_fp2copy(pts[npts-1]->Z, R->Z); + index = pts_index[npts-1]; + npts -= 1; + } + + get_4_isog(R, A24plus, C24, coeff); + eval_4_isog(phiP, coeff); + eval_4_isog(phiQ, coeff); + eval_4_isog(phiR, coeff); + + inv_3_way(phiP->Z, phiQ->Z, phiR->Z); + sike_fp2mul_mont(phiP->X, phiP->Z, phiP->X); + sike_fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); + sike_fp2mul_mont(phiR->X, phiR->Z, phiR->X); + + // Format public key + 
sike_fp2_encode(phiP->X, pkA); + sike_fp2_encode(phiQ->X, pkA + SIDH_JINV_BYTESZ); + sike_fp2_encode(phiR->X, pkA + 2*SIDH_JINV_BYTESZ); +} + +// Bob's ephemeral key-pair generation +// It produces a private key skB and computes the public key pkB. +// The private key is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. +// The public key consists of 3 GF(p503^2) elements encoded in 378 bytes. +static void gen_iso_B(const uint8_t* skB, uint8_t* pkB) +{ + point_proj_t R, pts[MAX_INT_POINTS_BOB]; + point_proj_t phiP = POINT_PROJ_INIT; + point_proj_t phiQ = POINT_PROJ_INIT; + point_proj_t phiR = POINT_PROJ_INIT; + f2elm_t XPB, XQB, XRB, coeff[3]; + f2elm_t A24plus = F2ELM_INIT; + f2elm_t A24minus = F2ELM_INIT; + f2elm_t A = F2ELM_INIT; + unsigned int m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; + + // Initialize basis points + sike_init_basis(params.B_gen, XPB, XQB, XRB); + sike_init_basis(params.A_gen, phiP->X, phiQ->X, phiR->X); + sike_fpcopy(params.mont_one, (phiP->Z)->c0); + sike_fpcopy(params.mont_one, (phiQ->Z)->c0); + sike_fpcopy(params.mont_one, (phiR->Z)->c0); + + // Initialize constants: A24minus = A-2C, A24plus = A+2C, where A=6, C=1 + sike_fpcopy(params.mont_one, A24plus->c0); + sike_fp2add(A24plus, A24plus, A24plus); + sike_fp2add(A24plus, A24plus, A24minus); + sike_fp2add(A24plus, A24minus, A); + sike_fp2add(A24minus, A24minus, A24plus); + + // Retrieve kernel point + ladder3Pt(XPB, XQB, XRB, skB, 0, R, A); + + // Traverse tree + index = 0; + for (size_t row = 1; row < B_max; row++) { + while (index < B_max-row) { + sike_fp2copy(R->X, pts[npts]->X); + sike_fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = params.B_strat[ii++]; + xTPLe(R, R, A24minus, A24plus, m); + index += m; + } + get_3_isog(R, A24minus, A24plus, coeff); + + for (size_t i = 0; i < npts; i++) { + eval_3_isog(pts[i], coeff); + } + eval_3_isog(phiP, coeff); + eval_3_isog(phiQ, coeff); + eval_3_isog(phiR, coeff); + + 
sike_fp2copy(pts[npts-1]->X, R->X); + sike_fp2copy(pts[npts-1]->Z, R->Z); + index = pts_index[npts-1]; + npts -= 1; + } + + get_3_isog(R, A24minus, A24plus, coeff); + eval_3_isog(phiP, coeff); + eval_3_isog(phiQ, coeff); + eval_3_isog(phiR, coeff); + + inv_3_way(phiP->Z, phiQ->Z, phiR->Z); + sike_fp2mul_mont(phiP->X, phiP->Z, phiP->X); + sike_fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); + sike_fp2mul_mont(phiR->X, phiR->Z, phiR->X); + + // Format public key + sike_fp2_encode(phiP->X, pkB); + sike_fp2_encode(phiQ->X, pkB + SIDH_JINV_BYTESZ); + sike_fp2_encode(phiR->X, pkB + 2*SIDH_JINV_BYTESZ); +} + +// Alice's ephemeral shared secret computation +// It produces a shared secret key ssA using her secret key skA and Bob's public key pkB +// Inputs: Alice's skA is an integer in the range [0, 2^250 - 1], stored in 32 bytes. +// Bob's pkB consists of 3 GF(p503^2) elements encoded in 378 bytes. +// Output: a shared secret ssA that consists of one element in GF(p503^2) encoded in 126 bytes. +static void ex_iso_A(const uint8_t* skA, const uint8_t* pkB, uint8_t* ssA) +{ + point_proj_t R, pts[MAX_INT_POINTS_ALICE]; + f2elm_t coeff[3], PKB[3], jinv; + f2elm_t A24plus = F2ELM_INIT; + f2elm_t C24 = F2ELM_INIT; + f2elm_t A = F2ELM_INIT; + unsigned int m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; + + // Initialize images of Bob's basis + fp2_decode(pkB, PKB[0]); + fp2_decode(pkB + SIDH_JINV_BYTESZ, PKB[1]); + fp2_decode(pkB + 2*SIDH_JINV_BYTESZ, PKB[2]); + + // Initialize constants + get_A(PKB[0], PKB[1], PKB[2], A); + sike_fpadd(params.mont_one, params.mont_one, C24->c0); + sike_fp2add(A, C24, A24plus); + sike_fpadd(C24->c0, C24->c0, C24->c0); + + // Retrieve kernel point + ladder3Pt(PKB[0], PKB[1], PKB[2], skA, 1, R, A); + + // Traverse tree + index = 0; + for (size_t row = 1; row < A_max; row++) { + while (index < A_max-row) { + sike_fp2copy(R->X, pts[npts]->X); + sike_fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = params.A_strat[ii++]; + 
xDBLe(R, R, A24plus, C24, (2*m)); + index += m; + } + get_4_isog(R, A24plus, C24, coeff); + + for (size_t i = 0; i < npts; i++) { + eval_4_isog(pts[i], coeff); + } + + sike_fp2copy(pts[npts-1]->X, R->X); + sike_fp2copy(pts[npts-1]->Z, R->Z); + index = pts_index[npts-1]; + npts -= 1; + } + + get_4_isog(R, A24plus, C24, coeff); + sike_fp2add(A24plus, A24plus, A24plus); + sike_fp2sub(A24plus, C24, A24plus); + sike_fp2add(A24plus, A24plus, A24plus); + j_inv(A24plus, C24, jinv); + sike_fp2_encode(jinv, ssA); +} + +// Bob's ephemeral shared secret computation +// It produces a shared secret key ssB using his secret key skB and Alice's public key pkA +// Inputs: Bob's skB is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. +// Alice's pkA consists of 3 GF(p503^2) elements encoded in 378 bytes. +// Output: a shared secret ssB that consists of one element in GF(p503^2) encoded in 126 bytes. +static void ex_iso_B(const uint8_t* skB, const uint8_t* pkA, uint8_t* ssB) +{ + point_proj_t R, pts[MAX_INT_POINTS_BOB]; + f2elm_t coeff[3], PKB[3], jinv; + f2elm_t A24plus = F2ELM_INIT; + f2elm_t A24minus = F2ELM_INIT; + f2elm_t A = F2ELM_INIT; + unsigned int m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; + + // Initialize images of Alice's basis + fp2_decode(pkA, PKB[0]); + fp2_decode(pkA + SIDH_JINV_BYTESZ, PKB[1]); + fp2_decode(pkA + 2*SIDH_JINV_BYTESZ, PKB[2]); + + // Initialize constants + get_A(PKB[0], PKB[1], PKB[2], A); + sike_fpadd(params.mont_one, params.mont_one, A24minus->c0); + sike_fp2add(A, A24minus, A24plus); + sike_fp2sub(A, A24minus, A24minus); + + // Retrieve kernel point + ladder3Pt(PKB[0], PKB[1], PKB[2], skB, 0, R, A); + + // Traverse tree + index = 0; + for (size_t row = 1; row < B_max; row++) { + while (index < B_max-row) { + sike_fp2copy(R->X, pts[npts]->X); + sike_fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = params.B_strat[ii++]; + xTPLe(R, R, A24minus, A24plus, m); + index += m; + } + get_3_isog(R, 
A24minus, A24plus, coeff); + + for (size_t i = 0; i < npts; i++) { + eval_3_isog(pts[i], coeff); + } + + sike_fp2copy(pts[npts-1]->X, R->X); + sike_fp2copy(pts[npts-1]->Z, R->Z); + index = pts_index[npts-1]; + npts -= 1; + } + + get_3_isog(R, A24minus, A24plus, coeff); + sike_fp2add(A24plus, A24minus, A); + sike_fp2add(A, A, A); + sike_fp2sub(A24plus, A24minus, A24plus); + j_inv(A, A24plus, jinv); + sike_fp2_encode(jinv, ssB); +} + +int SIKE_keypair(uint8_t out_priv[SIKE_PRV_BYTESZ], + uint8_t out_pub[SIKE_PUB_BYTESZ]) { + // Calculate private key for Alice. Needs to be in range [0, 2^0xFA - 1] and < + // 253 bits + randombytes(out_priv, SIKE_PRV_BYTESZ); + out_priv[31] = (out_priv[31] | 0x01) & 0x03; + + gen_iso_B(out_priv, out_pub); + return 1; +} + +void SIKE_encaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], + uint8_t out_ciphertext[SIKE_CT_BYTESZ], + const uint8_t pub_key[SIKE_PUB_BYTESZ]) { + // Secret buffer is reused by the function to store some ephemeral + // secret data. It's size must be maximum of 64, + // SIKE_MSG_BYTESZ and SIDH_PRV_A_BITSZ in bytes. + uint8_t secret[32]; // OZAPTF, why? 
+ uint8_t j[SIDH_JINV_BYTESZ]; + uint8_t temp[SIKE_MSG_BYTESZ + SIKE_CT_BYTESZ]; + SHA256_CTX ctx; + + // Generate secret key for A + // secret key A = SHA256({0,1}^n || pub_key)) mod SIDH_PRV_A_BITSZ + randombytes(temp, SIKE_MSG_BYTESZ); + + sha256_init(&ctx); + sha256_update(&ctx, temp, SIKE_MSG_BYTESZ); + sha256_update(&ctx, pub_key, SIKE_PUB_BYTESZ); + sha256_final(&ctx, secret); + + // Generate public key for A - first part of the ciphertext + gen_iso_A(secret, out_ciphertext); + + // Generate c1: + // h = SHA256(j-invariant) + // c1 = h ^ m + ex_iso_A(secret, pub_key, j); + sha256_init(&ctx); + sha256_update(&ctx, j, sizeof(j)); + sha256_final(&ctx, secret); + + // c1 = h ^ m + uint8_t *c1 = &out_ciphertext[SIKE_PUB_BYTESZ]; + for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) { + c1[i] = temp[i] ^ secret[i]; + } + + sha256_init(&ctx); + sha256_update(&ctx, temp, SIKE_MSG_BYTESZ); + sha256_update(&ctx, out_ciphertext, SIKE_CT_BYTESZ); + sha256_final(&ctx, secret); + // Generate shared secret out_shared_key = SHA256(m||out_ciphertext) + memcpy(out_shared_key, secret, SIKE_SS_BYTESZ); +} + +void SIKE_decaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], + const uint8_t ciphertext[SIKE_CT_BYTESZ], + const uint8_t pub_key[SIKE_PUB_BYTESZ], + const uint8_t priv_key[SIKE_PRV_BYTESZ]) { + // Secret buffer is reused by the function to store some ephemeral + // secret data. It's size must be maximum of 64, + // SIKE_MSG_BYTESZ and SIDH_PRV_A_BITSZ in bytes. 
+ uint8_t secret[32]; + uint8_t j[SIDH_JINV_BYTESZ]; + uint8_t c0[SIKE_PUB_BYTESZ]; + uint8_t temp[SIKE_MSG_BYTESZ]; + uint8_t shared_nok[SIKE_MSG_BYTESZ]; + SHA256_CTX ctx; + + // This is OK as we are only using ephemeral keys in BoringSSL + randombytes(shared_nok, SIKE_MSG_BYTESZ); + + // Recover m + // Let ciphertext = c0 || c1 - both have fixed sizes + // m = F(j-invariant(c0, priv_key)) ^ c1 + ex_iso_B(priv_key, ciphertext, j); + + sha256_init(&ctx); + sha256_update(&ctx, j, sizeof(j)); + sha256_final(&ctx, secret); + + const uint8_t *c1 = &ciphertext[sizeof(c0)]; + for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) { + temp[i] = c1[i] ^ secret[i]; + } + + sha256_init(&ctx); + sha256_update(&ctx, temp, SIKE_MSG_BYTESZ); + sha256_update(&ctx, pub_key, SIKE_PUB_BYTESZ); + sha256_final(&ctx, secret); + + // Recover c0 = public key A + gen_iso_A(secret, c0); + crypto_word_t ok = ct_uint_eq( + ct_mem_eq(c0, ciphertext, SIKE_PUB_BYTESZ), 1); + for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) { + temp[i] = ct_select_8(ok, temp[i], shared_nok[i]); + } + + sha256_init(&ctx); + sha256_update(&ctx, temp, SIKE_MSG_BYTESZ); + sha256_update(&ctx, ciphertext, SIKE_CT_BYTESZ); + sha256_final(&ctx, secret); + + // Generate shared secret out_shared_key = SHA256(m||ciphertext) + memcpy(out_shared_key, secret, SIKE_SS_BYTESZ); +} diff --git a/src/kem/sike/sike-p434-sha256/utils.h b/src/kem/sike/sike-p434-sha256/utils.h new file mode 100644 index 00000000..87623d33 --- /dev/null +++ b/src/kem/sike/sike-p434-sha256/utils.h @@ -0,0 +1,231 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: internal header file for P434 +*********************************************************************************************/ + +#ifndef UTILS_H_ +#define UTILS_H_ + +#include +#include + +// Conversion macro from number of bits to number of bytes +#define BITS_TO_BYTES(nbits) 
(((nbits)+7)/8) + +// Bit size of the field +#define BITS_FIELD 434 +// Byte size of the field +#define FIELD_BYTESZ BITS_TO_BYTES(BITS_FIELD) +// Number of 64-bit words of a 224-bit element +#define NBITS_ORDER 224 +#define NWORDS64_ORDER ((NBITS_ORDER+63)/64) +// Number of elements in Alice's strategy +#define A_max 108 +// Number of elements in Bob's strategy +#define B_max 137 +// Word size size +#define RADIX sizeof(crypto_word_t)*8 +// Byte size of a limb +#define LSZ sizeof(crypto_word_t) + +#if defined(CPU_64_BIT) + typedef uint64_t crypto_word_t; + // Number of words of a 434-bit field element + #define NWORDS_FIELD 7 + // Number of "0" digits in the least significant part of p434 + 1 + #define ZERO_WORDS 3 + // U64_TO_WORDS expands |x| for a |crypto_word_t| array literal. + #define U64_TO_WORDS(x) UINT64_C(x) +#else + typedef uint32_t crypto_word_t; + // Number of words of a 434-bit field element + #define NWORDS_FIELD 14 + // Number of "0" digits in the least significant part of p434 + 1 + #define ZERO_WORDS 6 + // U64_TO_WORDS expands |x| for a |crypto_word_t| array literal. 
+ #define U64_TO_WORDS(x) \ + (uint32_t)(UINT64_C(x) & 0xffffffff), (uint32_t)(UINT64_C(x) >> 32) +#endif + +// Extended datatype support +#if !defined(HAS_UINT128) + typedef uint64_t uint128_t[2]; +#endif + +// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise +// Digit multiplication +#define MUL(multiplier, multiplicand, hi, lo) digit_x_digit((multiplier), (multiplicand), &(lo)); + +// If mask |x|==0xff.ff set |x| to 1, otherwise 0 +#define M2B(x) ((x)>>(RADIX-1)) + +// Digit addition with carry +#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ +do { \ + crypto_word_t tempReg = (addend1) + (crypto_word_t)(carryIn); \ + (sumOut) = (addend2) + tempReg; \ + (carryOut) = M2B(ct_uint_lt(tempReg, (crypto_word_t)(carryIn)) | \ + ct_uint_lt((sumOut), tempReg)); \ +} while(0) + +// Digit subtraction with borrow +#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ +do { \ + crypto_word_t tempReg = (minuend) - (subtrahend); \ + crypto_word_t borrowReg = M2B(ct_uint_lt((minuend), (subtrahend))); \ + borrowReg |= ((borrowIn) & ct_uint_eq(tempReg, 0)); \ + (differenceOut) = tempReg - (crypto_word_t)(borrowIn); \ + (borrowOut) = borrowReg; \ +} while(0) + +/* Old GCC 4.9 (jessie) doesn't implement {0} initialization properly, + which violates C11 as described in 6.7.9, 21 (similarily C99, 6.7.8). + Defines below are used to work around the bug, and provide a way + to initialize f2elem_t and point_proj_t structs. + Bug has been fixed in GCC6 (debian stretch). +*/ +#define F2ELM_INIT {{ {0}, {0} }} +#define POINT_PROJ_INIT {{ F2ELM_INIT, F2ELM_INIT }} + +// Datatype for representing 434-bit field elements (448-bit max.) +// Elements over GF(p434) are encoded in 63 octets in little endian format +// (i.e., the least significant octet is located in the lowest memory address). +typedef crypto_word_t felm_t[NWORDS_FIELD]; + +// An element in F_{p^2}, is composed of two coefficients from F_p, * i.e. 
+// Fp2 element = c0 + c1*i in F_{p^2} +// Datatype for representing double-precision 2x434-bit field elements (448-bit max.) +// Elements (a+b*i) over GF(p434^2), where a and b are defined over GF(p434), are +// encoded as {a, b}, with a in the lowest memory portion. +typedef struct { + felm_t c0; + felm_t c1; +} fp2; + +// Our F_{p^2} element type is a pointer to the struct. +typedef fp2 f2elm_t[1]; + +// Datatype for representing double-precision 2x434-bit +// field elements in contiguous memory. +typedef crypto_word_t dfelm_t[2*NWORDS_FIELD]; + +// Constants used during SIKE computation. +struct params_t { + // Stores a prime + const crypto_word_t prime[NWORDS_FIELD]; + // Stores prime + 1 + const crypto_word_t prime_p1[NWORDS_FIELD]; + // Stores prime * 2 + const crypto_word_t prime_x2[NWORDS_FIELD]; + // Alice's generator values {XPA0 + XPA1*i, XQA0 + XQA1*i, XRA0 + XRA1*i} + // in GF(prime^2), expressed in Montgomery representation + const crypto_word_t A_gen[6*NWORDS_FIELD]; + // Bob's generator values {XPB0 + XPB1*i, XQB0 + XQB1*i, XRB0 + XRB1*i} + // in GF(prime^2), expressed in Montgomery representation + const crypto_word_t B_gen[6*NWORDS_FIELD]; + // Montgomery constant mont_R2 = (2^448)^2 mod prime + const crypto_word_t mont_R2[NWORDS_FIELD]; + // Value 'one' in Montgomery representation + const crypto_word_t mont_one[NWORDS_FIELD]; + // Value '6' in Montgomery representation + const crypto_word_t mont_six[NWORDS_FIELD]; + // Fixed parameters for isogeny tree computation + const unsigned int A_strat[A_max-1]; + const unsigned int B_strat[B_max-1]; +}; + +// Point representation in projective XZ Montgomery coordinates. +typedef struct { + f2elm_t X; + f2elm_t Z; +} point_proj; +typedef point_proj point_proj_t[1]; + +// Checks whether two words are equal. Returns 1 in case it is, +// otherwise 0. 
+static inline crypto_word_t ct_uint_eq(crypto_word_t x, crypto_word_t y) +{ + // if x==y then t = 0 + crypto_word_t t = x ^ y; + // if x!=y t will have first bit set + t = (t >> 1) - t; + // return MSB - 1 in case x==y, otherwise 0 + return ((~t) >> (RADIX-1)); +} +// Constant time select. +// if pick == 1 (out = in1) +// if pick == 0 (out = in2) +// else out is undefined +static inline uint8_t ct_select_8(uint8_t flag, uint8_t in1, uint8_t in2) { + uint8_t mask = ((int8_t)(flag << 7))>>7; + return (in1&mask) | (in2&(~mask)); +} + +// Constant time memcmp. Returns 1 if p==q, otherwise 0 +static inline int ct_mem_eq(const void *p, const void *q, size_t n) +{ + const uint8_t *pp = (uint8_t*)p, *qq = (uint8_t*)q; + uint8_t a = 0; + + while (n--) a |= *pp++ ^ *qq++; + return (ct_uint_eq(a, 0)); +} + +/* +// Returns 1 if x> (RADIX-1)); +} +*/ + +/// OZAPTF: coppied from boringssl +static inline crypto_word_t constant_time_msb_w(crypto_word_t a) { + return 0u - (a >> (sizeof(a) * 8 - 1)); +} + +// constant_time_lt_w returns 0xff..f if a < b and 0 otherwise. +static inline crypto_word_t ct_uint_lt(crypto_word_t x, crypto_word_t y) +{ + /* + const crypto_word_t t1 = x^y; + const crypto_word_t t2 = x - y; + const crypto_word_t tt = x ^ (t1 | (t2^y)); + return (tt >> (RADIX-1)); + */ + // Consider the two cases of the problem: + // msb(a) == msb(b): a < b iff the MSB of a - b is set. + // msb(a) != msb(b): a < b iff the MSB of b is set. + // + // If msb(a) == msb(b) then the following evaluates as: + // msb(a^((a^b)|((a-b)^a))) == + // msb(a^((a-b) ^ a)) == (because msb(a^b) == 0) + // msb(a^a^(a-b)) == (rearranging) + // msb(a-b) (because ∀x. x^x == 0) + // + // Else, if msb(a) != msb(b) then the following evaluates as: + // msb(a^((a^b)|((a-b)^a))) == + // msb(a^(𝟙 | ((a-b)^a))) == (because msb(a^b) == 1 and 𝟙 + // represents a value s.t. 
msb(𝟙) = 1) + // msb(a^𝟙) == (because ORing with 1 results in 1) + // msb(b) + // + // + // Here is an SMT-LIB verification of this formula: + // + // (define-fun lt ((a (_ BitVec 32)) (b (_ BitVec 32))) (_ BitVec 32) + // (bvxor a (bvor (bvxor a b) (bvxor (bvsub a b) a))) + // ) + // + // (declare-fun a () (_ BitVec 32)) + // (declare-fun b () (_ BitVec 32)) + // + // (assert (not (= (= #x00000001 (bvlshr (lt a b) #x0000001f)) (bvult a b)))) + // (check-sat) + // (get-model) + return constant_time_msb_w(x^((x^y)|((x-y)^x))); +} +#endif // UTILS_H_ From 1096d2b87e58de568ad2e5d2109a53d48ff42667 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Fri, 9 Apr 2021 00:44:27 +0100 Subject: [PATCH 02/12] update sike --- CMakeLists.txt | 1 + public/pqc/pqc.h | 3 +- src/capi/pqapi.c | 118 +- src/capi/schemes.h | 118 ++ src/kem/sike/includes/sike/sike.h | 73 ++ src/kem/sike/sike-p434-sha256/asm/fp-x86_64.S | 1095 ----------------- .../sike/sike-p434-sha256/asm/fp_generic.c | 179 --- src/kem/sike/sike-p434-sha256/fpx.c | 282 ----- src/kem/sike/sike-p434-sha256/fpx.h | 112 -- src/kem/sike/sike-p434-sha256/isogeny.c | 262 ---- src/kem/sike/sike-p434-sha256/isogeny.h | 49 - src/kem/sike/sike-p434-sha256/params.c | 128 -- src/kem/sike/sike-p434-sha256/sike.c | 517 -------- src/kem/sike/sike-p434-sha256/utils.h | 231 ---- src/rustapi/pqc-sys/src/bindings.rs | 3 +- test/katrunner/src/main.rs | 1 + 16 files changed, 198 insertions(+), 2974 deletions(-) create mode 100644 src/capi/schemes.h create mode 100644 src/kem/sike/includes/sike/sike.h delete mode 100644 src/kem/sike/sike-p434-sha256/asm/fp-x86_64.S delete mode 100644 src/kem/sike/sike-p434-sha256/asm/fp_generic.c delete mode 100644 src/kem/sike/sike-p434-sha256/fpx.c delete mode 100644 src/kem/sike/sike-p434-sha256/fpx.h delete mode 100644 src/kem/sike/sike-p434-sha256/isogeny.c delete mode 100644 src/kem/sike/sike-p434-sha256/isogeny.h delete mode 100644 src/kem/sike/sike-p434-sha256/params.c delete mode 100644 
src/kem/sike/sike-p434-sha256/sike.c delete mode 100644 src/kem/sike/sike-p434-sha256/utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ee78b71..313200b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -148,6 +148,7 @@ add_subdirectory(src/kem/ntru_prime/ntrulpr857/clean) add_subdirectory(src/kem/hqc/hqc-rmrs-128/clean) add_subdirectory(src/kem/hqc/hqc-rmrs-192/clean) add_subdirectory(src/kem/hqc/hqc-rmrs-256/clean) +add_subdirectory(src/kem/sike) # Hardware optimized targets if(${ARCH} STREQUAL "ARCH_x86_64") diff --git a/public/pqc/pqc.h b/public/pqc/pqc.h index 8fd651a3..9cb862af 100644 --- a/public/pqc/pqc.h +++ b/public/pqc/pqc.h @@ -63,7 +63,8 @@ extern "C" { _(SABER) \ _(HQCRMRS128) \ _(HQCRMRS192) \ - _(HQCRMRS256) + _(HQCRMRS256) \ + _(SIKE434) // Defines IDs for each algorithm. The // PQC_ALG_SIG/KEM_MAX indicates number diff --git a/src/capi/pqapi.c b/src/capi/pqapi.c index bd41aa4f..d00260d3 100644 --- a/src/capi/pqapi.c +++ b/src/capi/pqapi.c @@ -2,123 +2,7 @@ #include #include -// PQClean include -#include "sign/rainbow/rainbowV-classic/clean/api.h" -#include "sign/rainbow/rainbowI-classic/clean/api.h" -#include "sign/rainbow/rainbowIII-classic/clean/api.h" -#include "sign/sphincs/sphincs-sha256-192f-simple/clean/api.h" -#include "sign/sphincs/sphincs-sha256-192f-simple/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-256f-simple/clean/api.h" -#include "sign/sphincs/sphincs-shake256-256f-simple/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-192f-robust/clean/api.h" -#include "sign/sphincs/sphincs-shake256-192f-robust/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-128f-simple/clean/api.h" -#include "sign/sphincs/sphincs-shake256-128f-simple/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-256s-simple/clean/api.h" -#include "sign/sphincs/sphincs-shake256-256s-simple/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-128s-simple/clean/api.h" -#include "sign/sphincs/sphincs-shake256-128s-simple/avx2/api.h" -#include 
"sign/sphincs/sphincs-sha256-128f-robust/clean/api.h" -#include "sign/sphincs/sphincs-sha256-128f-robust/avx2/api.h" -#include "sign/sphincs/sphincs-sha256-192s-robust/clean/api.h" -#include "sign/sphincs/sphincs-sha256-192s-robust/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-128f-robust/clean/api.h" -#include "sign/sphincs/sphincs-shake256-128f-robust/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-128s-robust/clean/api.h" -#include "sign/sphincs/sphincs-shake256-128s-robust/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-256s-robust/clean/api.h" -#include "sign/sphincs/sphincs-shake256-256s-robust/avx2/api.h" -#include "sign/sphincs/sphincs-sha256-192s-simple/clean/api.h" -#include "sign/sphincs/sphincs-sha256-192s-simple/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-192s-simple/clean/api.h" -#include "sign/sphincs/sphincs-shake256-192s-simple/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-192s-robust/clean/api.h" -#include "sign/sphincs/sphincs-shake256-192s-robust/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-192f-simple/clean/api.h" -#include "sign/sphincs/sphincs-shake256-192f-simple/avx2/api.h" -#include "sign/sphincs/sphincs-sha256-256s-simple/clean/api.h" -#include "sign/sphincs/sphincs-sha256-256s-simple/avx2/api.h" -#include "sign/sphincs/sphincs-sha256-128s-simple/clean/api.h" -#include "sign/sphincs/sphincs-sha256-128s-simple/avx2/api.h" -#include "sign/sphincs/sphincs-shake256-256f-robust/clean/api.h" -#include "sign/sphincs/sphincs-shake256-256f-robust/avx2/api.h" -#include "sign/sphincs/sphincs-sha256-256f-robust/clean/api.h" -#include "sign/sphincs/sphincs-sha256-256f-robust/avx2/api.h" -#include "sign/sphincs/sphincs-sha256-256f-simple/clean/api.h" -#include "sign/sphincs/sphincs-sha256-256f-simple/avx2/api.h" -#include "sign/sphincs/sphincs-sha256-256s-robust/clean/api.h" -#include "sign/sphincs/sphincs-sha256-256s-robust/avx2/api.h" -#include "sign/sphincs/sphincs-sha256-128s-robust/clean/api.h" -#include 
"sign/sphincs/sphincs-sha256-128s-robust/avx2/api.h" -#include "sign/sphincs/sphincs-sha256-128f-simple/clean/api.h" -#include "sign/sphincs/sphincs-sha256-128f-simple/avx2/api.h" -#include "sign/sphincs/sphincs-sha256-192f-robust/clean/api.h" -#include "sign/sphincs/sphincs-sha256-192f-robust/avx2/api.h" -#include "sign/falcon/falcon-1024/clean/api.h" -#include "sign/falcon/falcon-1024/avx2/api.h" -#include "sign/falcon/falcon-512/clean/api.h" -#include "sign/falcon/falcon-512/avx2/api.h" -#include "sign/dilithium/dilithium2/clean/api.h" -#include "sign/dilithium/dilithium2/avx2/api.h" -#include "sign/dilithium/dilithium3/clean/api.h" -#include "sign/dilithium/dilithium3/avx2/api.h" -#include "sign/dilithium/dilithium5/clean/api.h" -#include "sign/dilithium/dilithium5/avx2/api.h" -#include "kem/ntru/ntruhps4096821/clean/api.h" -#include "kem/ntru/ntruhps4096821/avx2/api.h" -#include "kem/ntru/ntruhps2048509/clean/api.h" -#include "kem/ntru/ntruhps2048509/avx2/api.h" -#include "kem/ntru/ntruhrss701/clean/api.h" -#include "kem/ntru/ntruhrss701/avx2/api.h" -#include "kem/ntru/ntruhps2048677/clean/api.h" -#include "kem/ntru/ntruhps2048677/avx2/api.h" -#include "kem/ntru_prime/ntrulpr761/clean/api.h" -#include "kem/ntru_prime/ntrulpr761/avx2/api.h" -#include "kem/ntru_prime/ntrulpr653/clean/api.h" -#include "kem/ntru_prime/ntrulpr653/avx2/api.h" -#include "kem/ntru_prime/ntrulpr857/clean/api.h" -#include "kem/ntru_prime/ntrulpr857/avx2/api.h" -#include "kem/kyber/kyber768/clean/api.h" -#include "kem/kyber/kyber768/avx2/api.h" -#include "kem/kyber/kyber1024/clean/api.h" -#include "kem/kyber/kyber1024/avx2/api.h" -#include "kem/kyber/kyber512/clean/api.h" -#include "kem/kyber/kyber512/avx2/api.h" -#include "kem/mceliece/mceliece460896f/avx/api.h" -#include "kem/mceliece/mceliece460896f/clean/api.h" -#include "kem/mceliece/mceliece8192128/avx/api.h" -#include "kem/mceliece/mceliece8192128/clean/api.h" -#include "kem/mceliece/mceliece6688128f/avx/api.h" -#include 
"kem/mceliece/mceliece6688128f/clean/api.h" -#include "kem/mceliece/mceliece8192128f/avx/api.h" -#include "kem/mceliece/mceliece8192128f/clean/api.h" -#include "kem/mceliece/mceliece6960119f/avx/api.h" -#include "kem/mceliece/mceliece6960119f/clean/api.h" -#include "kem/mceliece/mceliece460896/avx/api.h" -#include "kem/mceliece/mceliece460896/clean/api.h" -#include "kem/mceliece/mceliece6688128/avx/api.h" -#include "kem/mceliece/mceliece6688128/clean/api.h" -#include "kem/mceliece/mceliece348864f/avx/api.h" -#include "kem/mceliece/mceliece348864f/clean/api.h" -#include "kem/mceliece/mceliece6960119/avx/api.h" -#include "kem/mceliece/mceliece6960119/clean/api.h" -#include "kem/mceliece/mceliece348864/avx/api.h" -#include "kem/mceliece/mceliece348864/clean/api.h" -#include "kem/frodo/frodokem976shake/clean/api.h" -#include "kem/frodo/frodokem1344shake/clean/api.h" -#include "kem/frodo/frodokem640shake/clean/api.h" -#include "kem/saber/lightsaber/clean/api.h" -#include "kem/saber/lightsaber/avx2/api.h" -#include "kem/saber/firesaber/clean/api.h" -#include "kem/saber/firesaber/avx2/api.h" -#include "kem/saber/saber/clean/api.h" -#include "kem/saber/saber/avx2/api.h" -#include "kem/hqc/hqc-rmrs-128/clean/api.h" -#include "kem/hqc/hqc-rmrs-192/clean/api.h" -#include "kem/hqc/hqc-rmrs-256/clean/api.h" -#include "kem/hqc/hqc-rmrs-128/avx2/api.h" -#include "kem/hqc/hqc-rmrs-192/avx2/api.h" -#include "kem/hqc/hqc-rmrs-256/avx2/api.h" +#include "schemes.h" // not proud of this thingy #define OPT_VERSION _CLEAN_ diff --git a/src/capi/schemes.h b/src/capi/schemes.h new file mode 100644 index 00000000..60a68893 --- /dev/null +++ b/src/capi/schemes.h @@ -0,0 +1,118 @@ +// PQClean include +#include "sign/rainbow/rainbowV-classic/clean/api.h" +#include "sign/rainbow/rainbowI-classic/clean/api.h" +#include "sign/rainbow/rainbowIII-classic/clean/api.h" +#include "sign/sphincs/sphincs-sha256-192f-simple/clean/api.h" +#include "sign/sphincs/sphincs-sha256-192f-simple/avx2/api.h" 
+#include "sign/sphincs/sphincs-shake256-256f-simple/clean/api.h" +#include "sign/sphincs/sphincs-shake256-256f-simple/avx2/api.h" +#include "sign/sphincs/sphincs-shake256-192f-robust/clean/api.h" +#include "sign/sphincs/sphincs-shake256-192f-robust/avx2/api.h" +#include "sign/sphincs/sphincs-shake256-128f-simple/clean/api.h" +#include "sign/sphincs/sphincs-shake256-128f-simple/avx2/api.h" +#include "sign/sphincs/sphincs-shake256-256s-simple/clean/api.h" +#include "sign/sphincs/sphincs-shake256-256s-simple/avx2/api.h" +#include "sign/sphincs/sphincs-shake256-128s-simple/clean/api.h" +#include "sign/sphincs/sphincs-shake256-128s-simple/avx2/api.h" +#include "sign/sphincs/sphincs-sha256-128f-robust/clean/api.h" +#include "sign/sphincs/sphincs-sha256-128f-robust/avx2/api.h" +#include "sign/sphincs/sphincs-sha256-192s-robust/clean/api.h" +#include "sign/sphincs/sphincs-sha256-192s-robust/avx2/api.h" +#include "sign/sphincs/sphincs-shake256-128f-robust/clean/api.h" +#include "sign/sphincs/sphincs-shake256-128f-robust/avx2/api.h" +#include "sign/sphincs/sphincs-shake256-128s-robust/clean/api.h" +#include "sign/sphincs/sphincs-shake256-128s-robust/avx2/api.h" +#include "sign/sphincs/sphincs-shake256-256s-robust/clean/api.h" +#include "sign/sphincs/sphincs-shake256-256s-robust/avx2/api.h" +#include "sign/sphincs/sphincs-sha256-192s-simple/clean/api.h" +#include "sign/sphincs/sphincs-sha256-192s-simple/avx2/api.h" +#include "sign/sphincs/sphincs-shake256-192s-simple/clean/api.h" +#include "sign/sphincs/sphincs-shake256-192s-simple/avx2/api.h" +#include "sign/sphincs/sphincs-shake256-192s-robust/clean/api.h" +#include "sign/sphincs/sphincs-shake256-192s-robust/avx2/api.h" +#include "sign/sphincs/sphincs-shake256-192f-simple/clean/api.h" +#include "sign/sphincs/sphincs-shake256-192f-simple/avx2/api.h" +#include "sign/sphincs/sphincs-sha256-256s-simple/clean/api.h" +#include "sign/sphincs/sphincs-sha256-256s-simple/avx2/api.h" +#include 
"sign/sphincs/sphincs-sha256-128s-simple/clean/api.h" +#include "sign/sphincs/sphincs-sha256-128s-simple/avx2/api.h" +#include "sign/sphincs/sphincs-shake256-256f-robust/clean/api.h" +#include "sign/sphincs/sphincs-shake256-256f-robust/avx2/api.h" +#include "sign/sphincs/sphincs-sha256-256f-robust/clean/api.h" +#include "sign/sphincs/sphincs-sha256-256f-robust/avx2/api.h" +#include "sign/sphincs/sphincs-sha256-256f-simple/clean/api.h" +#include "sign/sphincs/sphincs-sha256-256f-simple/avx2/api.h" +#include "sign/sphincs/sphincs-sha256-256s-robust/clean/api.h" +#include "sign/sphincs/sphincs-sha256-256s-robust/avx2/api.h" +#include "sign/sphincs/sphincs-sha256-128s-robust/clean/api.h" +#include "sign/sphincs/sphincs-sha256-128s-robust/avx2/api.h" +#include "sign/sphincs/sphincs-sha256-128f-simple/clean/api.h" +#include "sign/sphincs/sphincs-sha256-128f-simple/avx2/api.h" +#include "sign/sphincs/sphincs-sha256-192f-robust/clean/api.h" +#include "sign/sphincs/sphincs-sha256-192f-robust/avx2/api.h" +#include "sign/falcon/falcon-1024/clean/api.h" +#include "sign/falcon/falcon-1024/avx2/api.h" +#include "sign/falcon/falcon-512/clean/api.h" +#include "sign/falcon/falcon-512/avx2/api.h" +#include "sign/dilithium/dilithium2/clean/api.h" +#include "sign/dilithium/dilithium2/avx2/api.h" +#include "sign/dilithium/dilithium3/clean/api.h" +#include "sign/dilithium/dilithium3/avx2/api.h" +#include "sign/dilithium/dilithium5/clean/api.h" +#include "sign/dilithium/dilithium5/avx2/api.h" +#include "kem/ntru/ntruhps4096821/clean/api.h" +#include "kem/ntru/ntruhps4096821/avx2/api.h" +#include "kem/ntru/ntruhps2048509/clean/api.h" +#include "kem/ntru/ntruhps2048509/avx2/api.h" +#include "kem/ntru/ntruhrss701/clean/api.h" +#include "kem/ntru/ntruhrss701/avx2/api.h" +#include "kem/ntru/ntruhps2048677/clean/api.h" +#include "kem/ntru/ntruhps2048677/avx2/api.h" +#include "kem/ntru_prime/ntrulpr761/clean/api.h" +#include "kem/ntru_prime/ntrulpr761/avx2/api.h" +#include 
"kem/ntru_prime/ntrulpr653/clean/api.h" +#include "kem/ntru_prime/ntrulpr653/avx2/api.h" +#include "kem/ntru_prime/ntrulpr857/clean/api.h" +#include "kem/ntru_prime/ntrulpr857/avx2/api.h" +#include "kem/kyber/kyber768/clean/api.h" +#include "kem/kyber/kyber768/avx2/api.h" +#include "kem/kyber/kyber1024/clean/api.h" +#include "kem/kyber/kyber1024/avx2/api.h" +#include "kem/kyber/kyber512/clean/api.h" +#include "kem/kyber/kyber512/avx2/api.h" +#include "kem/mceliece/mceliece460896f/avx/api.h" +#include "kem/mceliece/mceliece460896f/clean/api.h" +#include "kem/mceliece/mceliece8192128/avx/api.h" +#include "kem/mceliece/mceliece8192128/clean/api.h" +#include "kem/mceliece/mceliece6688128f/avx/api.h" +#include "kem/mceliece/mceliece6688128f/clean/api.h" +#include "kem/mceliece/mceliece8192128f/avx/api.h" +#include "kem/mceliece/mceliece8192128f/clean/api.h" +#include "kem/mceliece/mceliece6960119f/avx/api.h" +#include "kem/mceliece/mceliece6960119f/clean/api.h" +#include "kem/mceliece/mceliece460896/avx/api.h" +#include "kem/mceliece/mceliece460896/clean/api.h" +#include "kem/mceliece/mceliece6688128/avx/api.h" +#include "kem/mceliece/mceliece6688128/clean/api.h" +#include "kem/mceliece/mceliece348864f/avx/api.h" +#include "kem/mceliece/mceliece348864f/clean/api.h" +#include "kem/mceliece/mceliece6960119/avx/api.h" +#include "kem/mceliece/mceliece6960119/clean/api.h" +#include "kem/mceliece/mceliece348864/avx/api.h" +#include "kem/mceliece/mceliece348864/clean/api.h" +#include "kem/frodo/frodokem976shake/clean/api.h" +#include "kem/frodo/frodokem1344shake/clean/api.h" +#include "kem/frodo/frodokem640shake/clean/api.h" +#include "kem/saber/lightsaber/clean/api.h" +#include "kem/saber/lightsaber/avx2/api.h" +#include "kem/saber/firesaber/clean/api.h" +#include "kem/saber/firesaber/avx2/api.h" +#include "kem/saber/saber/clean/api.h" +#include "kem/saber/saber/avx2/api.h" +#include "kem/hqc/hqc-rmrs-128/clean/api.h" +#include "kem/hqc/hqc-rmrs-192/clean/api.h" +#include 
"kem/hqc/hqc-rmrs-256/clean/api.h" +#include "kem/hqc/hqc-rmrs-128/avx2/api.h" +#include "kem/hqc/hqc-rmrs-192/avx2/api.h" +#include "kem/hqc/hqc-rmrs-256/avx2/api.h" +#include "kem/sike/includes/sike/sike.h" \ No newline at end of file diff --git a/src/kem/sike/includes/sike/sike.h b/src/kem/sike/includes/sike/sike.h new file mode 100644 index 00000000..09d1e580 --- /dev/null +++ b/src/kem/sike/includes/sike/sike.h @@ -0,0 +1,73 @@ +#ifndef SIKE_H_ +#define SIKE_H_ + +#include +#include + +/* SIKE + * + * SIKE is an isogeny-based post-quantum key encapsulation mechanism. Description of the + * algorithm is provided in [SIKE]. This implementation uses a 434-bit field size. The code + * is based on "Additional_Implementations" from the NIST PQC submission package which can + * be found here: + * https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/SIKE.zip + * + * [SIKE] https://sike.org/files/SIDH-spec.pdf + */ + +// SIKE_PUB_BYTESZ is the number of bytes in a public key. +#define SIKE_PUB_BYTESZ 330 +// SIKE_PRV_BYTESZ is the number of bytes in a private key. +#define SIKE_PRV_BYTESZ 28 +// SIKE_SS_BYTESZ is the number of bytes in a shared key. +#define SIKE_SS_BYTESZ 16 +// SIKE_MSG_BYTESZ is the number of bytes in a random bit string concatenated +// with the public key (see 1.4 of SIKE). +#define SIKE_MSG_BYTESZ 16 +// SIKE_CT_BYTESZ is the number of bytes in a ciphertext. +#define SIKE_CT_BYTESZ (SIKE_PUB_BYTESZ + SIKE_MSG_BYTESZ) + +// SIKE_keypair outputs a public and secret key. On success the +// function returns 1, otherwise 0. + int SIKE_keypair( + uint8_t out_priv[SIKE_PRV_BYTESZ], + uint8_t out_pub[SIKE_PUB_BYTESZ]); + +// SIKE_encaps generates and encrypts a random session key, writing those values to +// |out_shared_key| and |out_ciphertext|, respectively.
+ void SIKE_encaps( + uint8_t out_shared_key[SIKE_SS_BYTESZ], + uint8_t out_ciphertext[SIKE_CT_BYTESZ], + const uint8_t pub_key[SIKE_PUB_BYTESZ]); + +// SIKE_decaps outputs a random session key, writing it to |out_shared_key|. + void SIKE_decaps( + uint8_t out_shared_key[SIKE_SS_BYTESZ], + const uint8_t ciphertext[SIKE_CT_BYTESZ], + const uint8_t pub_key[SIKE_PUB_BYTESZ], + const uint8_t priv_key[SIKE_PRV_BYTESZ]); + +// boilerplate needed for integration +#define PQCLEAN_SIKE434_CLEAN_CRYPTO_SECRETKEYBYTES SIKE_PRV_BYTESZ +#define PQCLEAN_SIKE434_CLEAN_CRYPTO_PUBLICKEYBYTES SIKE_PUB_BYTESZ +#define PQCLEAN_SIKE434_CLEAN_CRYPTO_CIPHERTEXTBYTES SIKE_CT_BYTESZ +#define PQCLEAN_SIKE434_CLEAN_CRYPTO_BYTES SIKE_SS_BYTESZ +#define PQCLEAN_SIKE434_CLEAN_CRYPTO_ALGNAME "SIKE/p434" + +static inline int PQCLEAN_SIKE434_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { + SIKE_keypair(sk, pk); + memcpy(&sk[SIKE_PRV_BYTESZ+SIKE_MSG_BYTESZ], pk, SIKE_PUB_BYTESZ); + return 1; +} +static inline int PQCLEAN_SIKE434_CLEAN_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + SIKE_encaps(ss,ct,pk); + return 1; +} + +static inline int PQCLEAN_SIKE434_CLEAN_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + SIKE_decaps(ss, ct, &sk[SIKE_PRV_BYTESZ+SIKE_MSG_BYTESZ], sk); + return 1; +} + + +#endif diff --git a/src/kem/sike/sike-p434-sha256/asm/fp-x86_64.S b/src/kem/sike/sike-p434-sha256/asm/fp-x86_64.S deleted file mode 100644 index 4e2d7b74..00000000 --- a/src/kem/sike/sike-p434-sha256/asm/fp-x86_64.S +++ /dev/null @@ -1,1095 +0,0 @@ -# This file is generated from a similarly-named Perl script in the BoringSSL -# source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - -.Lp434x2: -.quad 0xFFFFFFFFFFFFFFFE -.quad 0xFFFFFFFFFFFFFFFF -.quad 0xFB82ECF5C5FFFFFF -.quad 0xF78CB8F062B15D47 -.quad 0xD9F8BFAD038A40AC -.quad 0x0004683E4E2EE688 - - -.Lp434p1: -.quad 0xFDC1767AE3000000 -.quad 0x7BC65C783158AEA3 -.quad 0x6CFC5FD681C52056 -.quad 0x0002341F27177344 - -.globl sike_fpadd -.hidden sike_fpadd -.type sike_fpadd,@function -sike_fpadd: -.cfi_startproc - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset r12, -16 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset r13, -24 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset r14, -32 - - xorq %rax,%rax - - movq 0(%rdi),%r8 - addq 0(%rsi),%r8 - movq 8(%rdi),%r9 - adcq 8(%rsi),%r9 - movq 16(%rdi),%r10 - adcq 16(%rsi),%r10 - movq 24(%rdi),%r11 - adcq 24(%rsi),%r11 - movq 32(%rdi),%r12 - adcq 32(%rsi),%r12 - movq 40(%rdi),%r13 - adcq 40(%rsi),%r13 - movq 48(%rdi),%r14 - adcq 48(%rsi),%r14 - - movq .Lp434x2(%rip),%rcx - subq %rcx,%r8 - movq 8+.Lp434x2(%rip),%rcx - sbbq %rcx,%r9 - sbbq %rcx,%r10 - movq 16+.Lp434x2(%rip),%rcx - sbbq %rcx,%r11 - movq 24+.Lp434x2(%rip),%rcx - sbbq %rcx,%r12 - movq 32+.Lp434x2(%rip),%rcx - sbbq %rcx,%r13 - movq 40+.Lp434x2(%rip),%rcx - sbbq %rcx,%r14 - - sbbq $0,%rax - - movq .Lp434x2(%rip),%rdi - andq %rax,%rdi - movq 8+.Lp434x2(%rip),%rsi - andq %rax,%rsi - movq 16+.Lp434x2(%rip),%rcx - andq %rax,%rcx - - addq %rdi,%r8 - movq %r8,0(%rdx) - adcq %rsi,%r9 - movq %r9,8(%rdx) - adcq %rsi,%r10 - movq %r10,16(%rdx) - adcq %rcx,%r11 - movq %r11,24(%rdx) - - setc %cl - movq 24+.Lp434x2(%rip),%r8 - andq %rax,%r8 - movq 32+.Lp434x2(%rip),%r9 - andq %rax,%r9 - movq 40+.Lp434x2(%rip),%r10 - andq %rax,%r10 - btq $0,%rcx - - adcq %r8,%r12 - movq %r12,32(%rdx) - adcq %r9,%r13 - movq %r13,40(%rdx) - adcq %r10,%r14 - movq 
%r14,48(%rdx) - - popq %r14 -.cfi_adjust_cfa_offset -8 - popq %r13 -.cfi_adjust_cfa_offset -8 - popq %r12 -.cfi_adjust_cfa_offset -8 - .byte 0xf3,0xc3 -.cfi_endproc -.globl sike_cswap_asm -.hidden sike_cswap_asm -.type sike_cswap_asm,@function -sike_cswap_asm: - - - movq %rdx,%xmm3 - - - - - - pshufd $68,%xmm3,%xmm3 - - movdqu 0(%rdi),%xmm0 - movdqu 0(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,0(%rdi) - movdqu %xmm1,0(%rsi) - - movdqu 16(%rdi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,16(%rdi) - movdqu %xmm1,16(%rsi) - - movdqu 32(%rdi),%xmm0 - movdqu 32(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,32(%rdi) - movdqu %xmm1,32(%rsi) - - movdqu 48(%rdi),%xmm0 - movdqu 48(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,48(%rdi) - movdqu %xmm1,48(%rsi) - - movdqu 64(%rdi),%xmm0 - movdqu 64(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,64(%rdi) - movdqu %xmm1,64(%rsi) - - movdqu 80(%rdi),%xmm0 - movdqu 80(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,80(%rdi) - movdqu %xmm1,80(%rsi) - - movdqu 96(%rdi),%xmm0 - movdqu 96(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,96(%rdi) - movdqu %xmm1,96(%rsi) - - movdqu 112(%rdi),%xmm0 - movdqu 112(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,112(%rdi) - movdqu %xmm1,112(%rsi) - - movdqu 128(%rdi),%xmm0 - movdqu 128(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor 
%xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,128(%rdi) - movdqu %xmm1,128(%rsi) - - movdqu 144(%rdi),%xmm0 - movdqu 144(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,144(%rdi) - movdqu %xmm1,144(%rsi) - - movdqu 160(%rdi),%xmm0 - movdqu 160(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,160(%rdi) - movdqu %xmm1,160(%rsi) - - movdqu 176(%rdi),%xmm0 - movdqu 176(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,176(%rdi) - movdqu %xmm1,176(%rsi) - - movdqu 192(%rdi),%xmm0 - movdqu 192(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,192(%rdi) - movdqu %xmm1,192(%rsi) - - movdqu 208(%rdi),%xmm0 - movdqu 208(%rsi),%xmm1 - movdqa %xmm1,%xmm2 - pxor %xmm0,%xmm2 - pand %xmm3,%xmm2 - pxor %xmm2,%xmm0 - pxor %xmm2,%xmm1 - movdqu %xmm0,208(%rdi) - movdqu %xmm1,208(%rsi) - - .byte 0xf3,0xc3 -.globl sike_fpsub -.hidden sike_fpsub -.type sike_fpsub,@function -sike_fpsub: -.cfi_startproc - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset r12, -16 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset r13, -24 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset r14, -32 - - xorq %rax,%rax - - movq 0(%rdi),%r8 - subq 0(%rsi),%r8 - movq 8(%rdi),%r9 - sbbq 8(%rsi),%r9 - movq 16(%rdi),%r10 - sbbq 16(%rsi),%r10 - movq 24(%rdi),%r11 - sbbq 24(%rsi),%r11 - movq 32(%rdi),%r12 - sbbq 32(%rsi),%r12 - movq 40(%rdi),%r13 - sbbq 40(%rsi),%r13 - movq 48(%rdi),%r14 - sbbq 48(%rsi),%r14 - - sbbq $0x0,%rax - - movq .Lp434x2(%rip),%rdi - andq %rax,%rdi - movq 8+.Lp434x2(%rip),%rsi - andq %rax,%rsi - movq 16+.Lp434x2(%rip),%rcx - andq %rax,%rcx - - addq %rdi,%r8 - movq %r8,0(%rdx) - adcq %rsi,%r9 - movq %r9,8(%rdx) - adcq %rsi,%r10 - movq %r10,16(%rdx) - adcq %rcx,%r11 - movq %r11,24(%rdx) - - setc 
%cl - movq 24+.Lp434x2(%rip),%r8 - andq %rax,%r8 - movq 32+.Lp434x2(%rip),%r9 - andq %rax,%r9 - movq 40+.Lp434x2(%rip),%r10 - andq %rax,%r10 - btq $0x0,%rcx - - adcq %r8,%r12 - adcq %r9,%r13 - adcq %r10,%r14 - movq %r12,32(%rdx) - movq %r13,40(%rdx) - movq %r14,48(%rdx) - - popq %r14 -.cfi_adjust_cfa_offset -8 - popq %r13 -.cfi_adjust_cfa_offset -8 - popq %r12 -.cfi_adjust_cfa_offset -8 - .byte 0xf3,0xc3 -.cfi_endproc -.globl sike_mpadd_asm -.hidden sike_mpadd_asm -.type sike_mpadd_asm,@function -sike_mpadd_asm: -.cfi_startproc - movq 0(%rdi),%r8; - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%rcx - addq 0(%rsi),%r8 - adcq 8(%rsi),%r9 - adcq 16(%rsi),%r10 - adcq 24(%rsi),%r11 - adcq 32(%rsi),%rcx - movq %r8,0(%rdx) - movq %r9,8(%rdx) - movq %r10,16(%rdx) - movq %r11,24(%rdx) - movq %rcx,32(%rdx) - - movq 40(%rdi),%r8 - movq 48(%rdi),%r9 - adcq 40(%rsi),%r8 - adcq 48(%rsi),%r9 - movq %r8,40(%rdx) - movq %r9,48(%rdx) - .byte 0xf3,0xc3 -.cfi_endproc -.globl sike_mpsubx2_asm -.hidden sike_mpsubx2_asm -.type sike_mpsubx2_asm,@function -sike_mpsubx2_asm: -.cfi_startproc - xorq %rax,%rax - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%rcx - subq 0(%rsi),%r8 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq 32(%rsi),%rcx - movq %r8,0(%rdx) - movq %r9,8(%rdx) - movq %r10,16(%rdx) - movq %r11,24(%rdx) - movq %rcx,32(%rdx) - - movq 40(%rdi),%r8 - movq 48(%rdi),%r9 - movq 56(%rdi),%r10 - movq 64(%rdi),%r11 - movq 72(%rdi),%rcx - sbbq 40(%rsi),%r8 - sbbq 48(%rsi),%r9 - sbbq 56(%rsi),%r10 - sbbq 64(%rsi),%r11 - sbbq 72(%rsi),%rcx - movq %r8,40(%rdx) - movq %r9,48(%rdx) - movq %r10,56(%rdx) - movq %r11,64(%rdx) - movq %rcx,72(%rdx) - - movq 80(%rdi),%r8 - movq 88(%rdi),%r9 - movq 96(%rdi),%r10 - movq 104(%rdi),%r11 - sbbq 80(%rsi),%r8 - sbbq 88(%rsi),%r9 - sbbq 96(%rsi),%r10 - sbbq 104(%rsi),%r11 - sbbq $0x0,%rax - movq %r8,80(%rdx) - movq %r9,88(%rdx) - movq %r10,96(%rdx) - movq 
%r11,104(%rdx) - .byte 0xf3,0xc3 -.cfi_endproc -.globl sike_mpdblsubx2_asm -.hidden sike_mpdblsubx2_asm -.type sike_mpdblsubx2_asm,@function -sike_mpdblsubx2_asm: -.cfi_startproc - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset r12, -16 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset r13, -24 - - xorq %rax,%rax - - - movq 0(%rdx),%r8 - movq 8(%rdx),%r9 - movq 16(%rdx),%r10 - movq 24(%rdx),%r11 - movq 32(%rdx),%r12 - movq 40(%rdx),%r13 - movq 48(%rdx),%rcx - subq 0(%rdi),%r8 - sbbq 8(%rdi),%r9 - sbbq 16(%rdi),%r10 - sbbq 24(%rdi),%r11 - sbbq 32(%rdi),%r12 - sbbq 40(%rdi),%r13 - sbbq 48(%rdi),%rcx - adcq $0x0,%rax - - - subq 0(%rsi),%r8 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq 32(%rsi),%r12 - sbbq 40(%rsi),%r13 - sbbq 48(%rsi),%rcx - adcq $0x0,%rax - - - movq %r8,0(%rdx) - movq %r9,8(%rdx) - movq %r10,16(%rdx) - movq %r11,24(%rdx) - movq %r12,32(%rdx) - movq %r13,40(%rdx) - movq %rcx,48(%rdx) - - - movq 56(%rdx),%r8 - movq 64(%rdx),%r9 - movq 72(%rdx),%r10 - movq 80(%rdx),%r11 - movq 88(%rdx),%r12 - movq 96(%rdx),%r13 - movq 104(%rdx),%rcx - - subq %rax,%r8 - sbbq 56(%rdi),%r8 - sbbq 64(%rdi),%r9 - sbbq 72(%rdi),%r10 - sbbq 80(%rdi),%r11 - sbbq 88(%rdi),%r12 - sbbq 96(%rdi),%r13 - sbbq 104(%rdi),%rcx - - - subq 56(%rsi),%r8 - sbbq 64(%rsi),%r9 - sbbq 72(%rsi),%r10 - sbbq 80(%rsi),%r11 - sbbq 88(%rsi),%r12 - sbbq 96(%rsi),%r13 - sbbq 104(%rsi),%rcx - - - movq %r8,56(%rdx) - movq %r9,64(%rdx) - movq %r10,72(%rdx) - movq %r11,80(%rdx) - movq %r12,88(%rdx) - movq %r13,96(%rdx) - movq %rcx,104(%rdx) - - popq %r13 -.cfi_adjust_cfa_offset -8 - popq %r12 -.cfi_adjust_cfa_offset -8 - .byte 0xf3,0xc3 -.cfi_endproc - -.globl sike_fprdc -.hidden sike_fprdc -.type sike_fprdc,@function -sike_fprdc: -.cfi_startproc - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset r12, -16 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset r13, -24 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset r14, -32 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset 
r15, -40 - - xorq %rax,%rax - movq 0+0(%rdi),%rdx - mulxq 0+.Lp434p1(%rip),%r8,%r9 - mulxq 8+.Lp434p1(%rip),%r12,%r10 - mulxq 16+.Lp434p1(%rip),%r13,%r11 - - adoxq %r12,%r9 - adoxq %r13,%r10 - - mulxq 24+.Lp434p1(%rip),%r13,%r12 - adoxq %r13,%r11 - adoxq %rax,%r12 - - xorq %rax,%rax - movq 0+8(%rdi),%rdx - mulxq 0+.Lp434p1(%rip),%r13,%rcx - adcxq %r13,%r9 - adcxq %rcx,%r10 - - mulxq 8+.Lp434p1(%rip),%rcx,%r13 - adcxq %r13,%r11 - adoxq %rcx,%r10 - - mulxq 16+.Lp434p1(%rip),%rcx,%r13 - adcxq %r13,%r12 - adoxq %rcx,%r11 - - mulxq 24+.Lp434p1(%rip),%rcx,%r13 - adcxq %rax,%r13 - adoxq %rcx,%r12 - adoxq %rax,%r13 - - xorq %rcx,%rcx - addq 24(%rdi),%r8 - adcq 32(%rdi),%r9 - adcq 40(%rdi),%r10 - adcq 48(%rdi),%r11 - adcq 56(%rdi),%r12 - adcq 64(%rdi),%r13 - adcq 72(%rdi),%rcx - movq %r8,24(%rdi) - movq %r9,32(%rdi) - movq %r10,40(%rdi) - movq %r11,48(%rdi) - movq %r12,56(%rdi) - movq %r13,64(%rdi) - movq %rcx,72(%rdi) - movq 80(%rdi),%r8 - movq 88(%rdi),%r9 - movq 96(%rdi),%r10 - movq 104(%rdi),%r11 - adcq $0x0,%r8 - adcq $0x0,%r9 - adcq $0x0,%r10 - adcq $0x0,%r11 - movq %r8,80(%rdi) - movq %r9,88(%rdi) - movq %r10,96(%rdi) - movq %r11,104(%rdi) - - xorq %rax,%rax - movq 16+0(%rdi),%rdx - mulxq 0+.Lp434p1(%rip),%r8,%r9 - mulxq 8+.Lp434p1(%rip),%r12,%r10 - mulxq 16+.Lp434p1(%rip),%r13,%r11 - - adoxq %r12,%r9 - adoxq %r13,%r10 - - mulxq 24+.Lp434p1(%rip),%r13,%r12 - adoxq %r13,%r11 - adoxq %rax,%r12 - - xorq %rax,%rax - movq 16+8(%rdi),%rdx - mulxq 0+.Lp434p1(%rip),%r13,%rcx - adcxq %r13,%r9 - adcxq %rcx,%r10 - - mulxq 8+.Lp434p1(%rip),%rcx,%r13 - adcxq %r13,%r11 - adoxq %rcx,%r10 - - mulxq 16+.Lp434p1(%rip),%rcx,%r13 - adcxq %r13,%r12 - adoxq %rcx,%r11 - - mulxq 24+.Lp434p1(%rip),%rcx,%r13 - adcxq %rax,%r13 - adoxq %rcx,%r12 - adoxq %rax,%r13 - - xorq %rcx,%rcx - addq 40(%rdi),%r8 - adcq 48(%rdi),%r9 - adcq 56(%rdi),%r10 - adcq 64(%rdi),%r11 - adcq 72(%rdi),%r12 - adcq 80(%rdi),%r13 - adcq 88(%rdi),%rcx - movq %r8,40(%rdi) - movq %r9,48(%rdi) - movq %r10,56(%rdi) - movq 
%r11,64(%rdi) - movq %r12,72(%rdi) - movq %r13,80(%rdi) - movq %rcx,88(%rdi) - movq 96(%rdi),%r8 - movq 104(%rdi),%r9 - adcq $0x0,%r8 - adcq $0x0,%r9 - movq %r8,96(%rdi) - movq %r9,104(%rdi) - - xorq %rax,%rax - movq 32+0(%rdi),%rdx - mulxq 0+.Lp434p1(%rip),%r8,%r9 - mulxq 8+.Lp434p1(%rip),%r12,%r10 - mulxq 16+.Lp434p1(%rip),%r13,%r11 - - adoxq %r12,%r9 - adoxq %r13,%r10 - - mulxq 24+.Lp434p1(%rip),%r13,%r12 - adoxq %r13,%r11 - adoxq %rax,%r12 - - xorq %rax,%rax - movq 32+8(%rdi),%rdx - mulxq 0+.Lp434p1(%rip),%r13,%rcx - adcxq %r13,%r9 - adcxq %rcx,%r10 - - mulxq 8+.Lp434p1(%rip),%rcx,%r13 - adcxq %r13,%r11 - adoxq %rcx,%r10 - - mulxq 16+.Lp434p1(%rip),%rcx,%r13 - adcxq %r13,%r12 - adoxq %rcx,%r11 - - mulxq 24+.Lp434p1(%rip),%rcx,%r13 - adcxq %rax,%r13 - adoxq %rcx,%r12 - adoxq %rax,%r13 - - xorq %rcx,%rcx - addq 56(%rdi),%r8 - adcq 64(%rdi),%r9 - adcq 72(%rdi),%r10 - adcq 80(%rdi),%r11 - adcq 88(%rdi),%r12 - adcq 96(%rdi),%r13 - adcq 104(%rdi),%rcx - movq %r8,0(%rsi) - movq %r9,8(%rsi) - movq %r10,72(%rdi) - movq %r11,80(%rdi) - movq %r12,88(%rdi) - movq %r13,96(%rdi) - movq %rcx,104(%rdi) - - xorq %rax,%rax - movq 48(%rdi),%rdx - mulxq 0+.Lp434p1(%rip),%r8,%r9 - mulxq 8+.Lp434p1(%rip),%r12,%r10 - mulxq 16+.Lp434p1(%rip),%r13,%r11 - - adoxq %r12,%r9 - adoxq %r13,%r10 - - mulxq 24+.Lp434p1(%rip),%r13,%r12 - adoxq %r13,%r11 - adoxq %rax,%r12 - - addq 72(%rdi),%r8 - adcq 80(%rdi),%r9 - adcq 88(%rdi),%r10 - adcq 96(%rdi),%r11 - adcq 104(%rdi),%r12 - movq %r8,16(%rsi) - movq %r9,24(%rsi) - movq %r10,32(%rsi) - movq %r11,40(%rsi) - movq %r12,48(%rsi) - - - popq %r15 -.cfi_adjust_cfa_offset -8 - popq %r14 -.cfi_adjust_cfa_offset -8 - popq %r13 -.cfi_adjust_cfa_offset -8 - popq %r12 -.cfi_adjust_cfa_offset -8 - .byte 0xf3,0xc3 -.cfi_endproc -.globl sike_mpmul -.hidden sike_mpmul -.type sike_mpmul,@function -sike_mpmul: -.cfi_startproc - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset r12, -16 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset r13, -24 - pushq %r14 
-.cfi_adjust_cfa_offset 8 -.cfi_offset r14, -32 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset r15, -40 - - - movq %rdx,%rcx - xorq %rax,%rax - - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset rbx, -48 - pushq %rbp -.cfi_offset rbp, -56 -.cfi_adjust_cfa_offset 8 - subq $96,%rsp -.cfi_adjust_cfa_offset 96 - - addq 32(%rdi),%r8 - adcq 40(%rdi),%r9 - adcq 48(%rdi),%r10 - adcq $0x0,%r11 - sbbq $0x0,%rax - movq %r8,0(%rsp) - movq %r9,8(%rsp) - movq %r10,16(%rsp) - movq %r11,24(%rsp) - - - xorq %rbx,%rbx - movq 0(%rsi),%r12 - movq 8(%rsi),%r13 - movq 16(%rsi),%r14 - movq 24(%rsi),%r15 - addq 32(%rsi),%r12 - adcq 40(%rsi),%r13 - adcq 48(%rsi),%r14 - adcq $0x0,%r15 - sbbq $0x0,%rbx - movq %r12,32(%rsp) - movq %r13,40(%rsp) - movq %r14,48(%rsp) - movq %r15,56(%rsp) - - - andq %rax,%r12 - andq %rax,%r13 - andq %rax,%r14 - andq %rax,%r15 - - - andq %rbx,%r8 - andq %rbx,%r9 - andq %rbx,%r10 - andq %rbx,%r11 - - - addq %r12,%r8 - adcq %r13,%r9 - adcq %r14,%r10 - adcq %r15,%r11 - movq %r8,64(%rsp) - movq %r9,72(%rsp) - movq %r10,80(%rsp) - movq %r11,88(%rsp) - - - movq 0+0(%rsp),%rdx - mulxq 32+0(%rsp),%r9,%r8 - movq %r9,0+0(%rsp) - mulxq 32+8(%rsp),%r10,%r9 - xorq %rax,%rax - adoxq %r10,%r8 - mulxq 32+16(%rsp),%r11,%r10 - adoxq %r11,%r9 - mulxq 32+24(%rsp),%r12,%r11 - adoxq %r12,%r10 - - movq 0+8(%rsp),%rdx - mulxq 32+0(%rsp),%r12,%r13 - adoxq %rax,%r11 - xorq %rax,%rax - mulxq 32+8(%rsp),%r15,%r14 - adoxq %r8,%r12 - movq %r12,0+8(%rsp) - adcxq %r15,%r13 - mulxq 32+16(%rsp),%rbx,%r15 - adcxq %rbx,%r14 - adoxq %r9,%r13 - mulxq 32+24(%rsp),%rbp,%rbx - adcxq %rbp,%r15 - adcxq %rax,%rbx - adoxq %r10,%r14 - - movq 0+16(%rsp),%rdx - mulxq 32+0(%rsp),%r8,%r9 - adoxq %r11,%r15 - adoxq %rax,%rbx - xorq %rax,%rax - mulxq 32+8(%rsp),%r11,%r10 - adoxq %r13,%r8 - movq %r8,0+16(%rsp) - adcxq %r11,%r9 - mulxq 32+16(%rsp),%r12,%r11 - adcxq %r12,%r10 - adoxq %r14,%r9 - mulxq 32+24(%rsp),%rbp,%r12 - adcxq 
%rbp,%r11 - adcxq %rax,%r12 - - adoxq %r15,%r10 - adoxq %rbx,%r11 - adoxq %rax,%r12 - - movq 0+24(%rsp),%rdx - mulxq 32+0(%rsp),%r8,%r13 - xorq %rax,%rax - mulxq 32+8(%rsp),%r15,%r14 - adcxq %r15,%r13 - adoxq %r8,%r9 - mulxq 32+16(%rsp),%rbx,%r15 - adcxq %rbx,%r14 - adoxq %r13,%r10 - mulxq 32+24(%rsp),%rbp,%rbx - adcxq %rbp,%r15 - adcxq %rax,%rbx - adoxq %r14,%r11 - adoxq %r15,%r12 - adoxq %rax,%rbx - movq %r9,0+24(%rsp) - movq %r10,0+32(%rsp) - movq %r11,0+40(%rsp) - movq %r12,0+48(%rsp) - movq %rbx,0+56(%rsp) - - - - movq 0+0(%rdi),%rdx - mulxq 0+0(%rsi),%r9,%r8 - movq %r9,0+0(%rcx) - mulxq 0+8(%rsi),%r10,%r9 - xorq %rax,%rax - adoxq %r10,%r8 - mulxq 0+16(%rsi),%r11,%r10 - adoxq %r11,%r9 - mulxq 0+24(%rsi),%r12,%r11 - adoxq %r12,%r10 - - movq 0+8(%rdi),%rdx - mulxq 0+0(%rsi),%r12,%r13 - adoxq %rax,%r11 - xorq %rax,%rax - mulxq 0+8(%rsi),%r15,%r14 - adoxq %r8,%r12 - movq %r12,0+8(%rcx) - adcxq %r15,%r13 - mulxq 0+16(%rsi),%rbx,%r15 - adcxq %rbx,%r14 - adoxq %r9,%r13 - mulxq 0+24(%rsi),%rbp,%rbx - adcxq %rbp,%r15 - adcxq %rax,%rbx - adoxq %r10,%r14 - - movq 0+16(%rdi),%rdx - mulxq 0+0(%rsi),%r8,%r9 - adoxq %r11,%r15 - adoxq %rax,%rbx - xorq %rax,%rax - mulxq 0+8(%rsi),%r11,%r10 - adoxq %r13,%r8 - movq %r8,0+16(%rcx) - adcxq %r11,%r9 - mulxq 0+16(%rsi),%r12,%r11 - adcxq %r12,%r10 - adoxq %r14,%r9 - mulxq 0+24(%rsi),%rbp,%r12 - adcxq %rbp,%r11 - adcxq %rax,%r12 - - adoxq %r15,%r10 - adoxq %rbx,%r11 - adoxq %rax,%r12 - - movq 0+24(%rdi),%rdx - mulxq 0+0(%rsi),%r8,%r13 - xorq %rax,%rax - mulxq 0+8(%rsi),%r15,%r14 - adcxq %r15,%r13 - adoxq %r8,%r9 - mulxq 0+16(%rsi),%rbx,%r15 - adcxq %rbx,%r14 - adoxq %r13,%r10 - mulxq 0+24(%rsi),%rbp,%rbx - adcxq %rbp,%r15 - adcxq %rax,%rbx - adoxq %r14,%r11 - adoxq %r15,%r12 - adoxq %rax,%rbx - movq %r9,0+24(%rcx) - movq %r10,0+32(%rcx) - movq %r11,0+40(%rcx) - movq %r12,0+48(%rcx) - movq %rbx,0+56(%rcx) - - - - movq 32+0(%rdi),%rdx - mulxq 32+0(%rsi),%r9,%r8 - movq %r9,64+0(%rcx) - mulxq 32+8(%rsi),%r10,%r9 - xorq %rax,%rax - adoxq 
%r10,%r8 - mulxq 32+16(%rsi),%r11,%r10 - adoxq %r11,%r9 - - movq 32+8(%rdi),%rdx - mulxq 32+0(%rsi),%r12,%r11 - adoxq %rax,%r10 - xorq %rax,%rax - - mulxq 32+8(%rsi),%r14,%r13 - adoxq %r8,%r12 - movq %r12,64+8(%rcx) - adcxq %r14,%r11 - - mulxq 32+16(%rsi),%r8,%r14 - adoxq %r9,%r11 - adcxq %r8,%r13 - adcxq %rax,%r14 - adoxq %r10,%r13 - - movq 32+16(%rdi),%rdx - mulxq 32+0(%rsi),%r8,%r9 - adoxq %rax,%r14 - xorq %rax,%rax - - mulxq 32+8(%rsi),%r10,%r12 - adoxq %r11,%r8 - movq %r8,64+16(%rcx) - adcxq %r13,%r9 - - mulxq 32+16(%rsi),%r11,%r8 - adcxq %r14,%r12 - adcxq %rax,%r8 - adoxq %r10,%r9 - adoxq %r12,%r11 - adoxq %rax,%r8 - movq %r9,64+24(%rcx) - movq %r11,64+32(%rcx) - movq %r8,64+40(%rcx) - - - - - movq 64(%rsp),%r8 - movq 72(%rsp),%r9 - movq 80(%rsp),%r10 - movq 88(%rsp),%r11 - - movq 32(%rsp),%rax - addq %rax,%r8 - movq 40(%rsp),%rax - adcq %rax,%r9 - movq 48(%rsp),%rax - adcq %rax,%r10 - movq 56(%rsp),%rax - adcq %rax,%r11 - - - movq 0(%rsp),%r12 - movq 8(%rsp),%r13 - movq 16(%rsp),%r14 - movq 24(%rsp),%r15 - subq 0(%rcx),%r12 - sbbq 8(%rcx),%r13 - sbbq 16(%rcx),%r14 - sbbq 24(%rcx),%r15 - sbbq 32(%rcx),%r8 - sbbq 40(%rcx),%r9 - sbbq 48(%rcx),%r10 - sbbq 56(%rcx),%r11 - - - subq 64(%rcx),%r12 - sbbq 72(%rcx),%r13 - sbbq 80(%rcx),%r14 - sbbq 88(%rcx),%r15 - sbbq 96(%rcx),%r8 - sbbq 104(%rcx),%r9 - sbbq $0x0,%r10 - sbbq $0x0,%r11 - - addq 32(%rcx),%r12 - movq %r12,32(%rcx) - adcq 40(%rcx),%r13 - movq %r13,40(%rcx) - adcq 48(%rcx),%r14 - movq %r14,48(%rcx) - adcq 56(%rcx),%r15 - movq %r15,56(%rcx) - adcq 64(%rcx),%r8 - movq %r8,64(%rcx) - adcq 72(%rcx),%r9 - movq %r9,72(%rcx) - adcq 80(%rcx),%r10 - movq %r10,80(%rcx) - adcq 88(%rcx),%r11 - movq %r11,88(%rcx) - movq 96(%rcx),%r12 - adcq $0x0,%r12 - movq %r12,96(%rcx) - movq 104(%rcx),%r13 - adcq $0x0,%r13 - movq %r13,104(%rcx) - - addq $96,%rsp -.cfi_adjust_cfa_offset -96 - popq %rbp -.cfi_adjust_cfa_offset -8 -.cfi_same_value rbp - popq %rbx -.cfi_adjust_cfa_offset -8 -.cfi_same_value rbx - - - popq %r15 
-.cfi_adjust_cfa_offset -8 - popq %r14 -.cfi_adjust_cfa_offset -8 - popq %r13 -.cfi_adjust_cfa_offset -8 - popq %r12 -.cfi_adjust_cfa_offset -8 - .byte 0xf3,0xc3 -.cfi_endproc -#endif diff --git a/src/kem/sike/sike-p434-sha256/asm/fp_generic.c b/src/kem/sike/sike-p434-sha256/asm/fp_generic.c deleted file mode 100644 index 38e7645e..00000000 --- a/src/kem/sike/sike-p434-sha256/asm/fp_generic.c +++ /dev/null @@ -1,179 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: portable modular arithmetic for P503 -*********************************************************************************************/ - -#if defined(ARCH_GENERIC) || \ - (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) - -#include "../utils.h" -#include "../fpx.h" - -// Global constants -extern const struct params_t params; - -static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c) -{ // Digit multiplication, digit * digit -> 2-digit result - crypto_word_t al, ah, bl, bh, temp; - crypto_word_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; - crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4); - crypto_word_t mask_high = (crypto_word_t)(-1) << (sizeof(crypto_word_t)*4); - - al = a & mask_low; // Low part - ah = a >> (sizeof(crypto_word_t) * 4); // High part - bl = b & mask_low; - bh = b >> (sizeof(crypto_word_t) * 4); - - albl = al*bl; - albh = al*bh; - ahbl = ah*bl; - ahbh = ah*bh; - c[0] = albl & mask_low; // C00 - - res1 = albl >> (sizeof(crypto_word_t) * 4); - res2 = ahbl & mask_low; - res3 = albh & mask_low; - temp = res1 + res2 + res3; - carry = temp >> (sizeof(crypto_word_t) * 4); - c[0] ^= temp << (sizeof(crypto_word_t) * 4); // C01 - - res1 = ahbl >> (sizeof(crypto_word_t) * 4); - res2 = albh >> (sizeof(crypto_word_t) * 4); - res3 = ahbh & mask_low; - temp = res1 + res2 + res3 + carry; - c[1] = temp & 
mask_low; // C10 - carry = temp & mask_high; - c[1] ^= (ahbh & mask_high) + carry; // C11 -} - -void sike_fpadd(const felm_t a, const felm_t b, felm_t c) -{ // Modular addition, c = a+b mod p434. - // Inputs: a, b in [0, 2*p434-1] - // Output: c in [0, 2*p434-1] - unsigned int i, carry = 0; - crypto_word_t mask; - - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, a[i], b[i], carry, c[i]); - } - - carry = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(carry, c[i], params.prime_x2[i], carry, c[i]); - } - mask = 0 - (crypto_word_t)carry; - - carry = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, c[i], params.prime_x2[i] & mask, carry, c[i]); - } -} - -void sike_fpsub(const felm_t a, const felm_t b, felm_t c) -{ // Modular subtraction, c = a-b mod p434. - // Inputs: a, b in [0, 2*p434-1] - // Output: c in [0, 2*p434-1] - unsigned int i, borrow = 0; - crypto_word_t mask; - - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(borrow, a[i], b[i], borrow, c[i]); - } - mask = 0 - (crypto_word_t)borrow; - - borrow = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(borrow, c[i], params.prime_x2[i] & mask, borrow, c[i]); - } -} - -void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) -{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. 
- unsigned int i, j; - crypto_word_t t = 0, u = 0, v = 0, UV[2]; - unsigned int carry = 0; - - for (i = 0; i < NWORDS_FIELD; i++) { - for (j = 0; j <= i; j++) { - MUL(a[j], b[i-j], UV+1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - c[i] = v; - v = u; - u = t; - t = 0; - } - - for (i = NWORDS_FIELD; i < 2*NWORDS_FIELD-1; i++) { - for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) { - MUL(a[j], b[i-j], UV+1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - c[i] = v; - v = u; - u = t; - t = 0; - } - c[2*NWORDS_FIELD-1] = v; -} - -void sike_fprdc(const felm_t ma, felm_t mc) -{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. - // mc = ma*R^-1 mod p434x2, where R = 2^448. - // If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. - // ma is assumed to be in Montgomery representation. - unsigned int i, j, carry, count = ZERO_WORDS; - crypto_word_t UV[2], t = 0, u = 0, v = 0; - - for (i = 0; i < NWORDS_FIELD; i++) { - mc[i] = 0; - } - - for (i = 0; i < NWORDS_FIELD; i++) { - for (j = 0; j < i; j++) { - if (j < (i-ZERO_WORDS+1)) { - MUL(mc[j], params.prime_p1[i-j], UV+1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - } - ADDC(0, v, ma[i], carry, v); - ADDC(carry, u, 0, carry, u); - t += carry; - mc[i] = v; - v = u; - u = t; - t = 0; - } - - for (i = NWORDS_FIELD; i < 2*NWORDS_FIELD-1; i++) { - if (count > 0) { - count -= 1; - } - for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) { - if (j < (NWORDS_FIELD-count)) { - MUL(mc[j], params.prime_p1[i-j], UV+1, UV[0]); - ADDC(0, UV[0], v, carry, v); - ADDC(carry, UV[1], u, carry, u); - t += carry; - } - } - ADDC(0, v, ma[i], carry, v); - ADDC(carry, u, 0, carry, u); - t += carry; - mc[i-NWORDS_FIELD] = v; - v = u; - u = t; - t = 0; - } - ADDC(0, v, ma[2*NWORDS_FIELD-1], carry, v); - mc[NWORDS_FIELD-1] = v; -} - -#endif // NO_ASM || 
(!X86_64 && !AARCH64) diff --git a/src/kem/sike/sike-p434-sha256/fpx.c b/src/kem/sike/sike-p434-sha256/fpx.c deleted file mode 100644 index 30233406..00000000 --- a/src/kem/sike/sike-p434-sha256/fpx.c +++ /dev/null @@ -1,282 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: core functions over GF(p) and GF(p^2) -*********************************************************************************************/ -#include -#include "utils.h" -#include "fpx.h" - -extern const struct params_t params; - -// Multiprecision squaring, c = a^2 mod p. -static void fpsqr_mont(const felm_t ma, felm_t mc) -{ - dfelm_t temp = {0}; - sike_mpmul(ma, ma, temp); - sike_fprdc(temp, mc); -} - -// Chain to compute a^(p-3)/4 using Montgomery arithmetic. -static void fpinv_chain_mont(felm_t a) -{ - unsigned int i, j; - felm_t t[31], tt; - - // Precomputed table - fpsqr_mont(a, tt); - sike_fpmul_mont(a, tt, t[0]); - for (i = 0; i <= 29; i++) sike_fpmul_mont(t[i], tt, t[i+1]); - - sike_fpcopy(a, tt); - for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[5], tt, tt); - for (i = 0; i < 10; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[14], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[3], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[23], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[13], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[24], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[7], tt, tt); - for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[12], tt, tt); - for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[30], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[1], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[30], tt, tt); - 
for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[21], tt, tt); - for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[2], tt, tt); - for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[19], tt, tt); - for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[1], tt, tt); - for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[24], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[26], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[16], tt, tt); - for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[10], tt, tt); - for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[6], tt, tt); - for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[0], tt, tt); - for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[20], tt, tt); - for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[9], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[25], tt, tt); - for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[30], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[26], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(a, tt, tt); - for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[28], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[6], tt, tt); - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[10], tt, tt); - for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[22], tt, tt); - for (j = 0; j < 35; j++) { - for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); - sike_fpmul_mont(t[30], tt, tt); - } - sike_fpcopy(tt, a); -} - -// Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. 
-static void fpinv_mont(felm_t a) -{ - felm_t tt = {0}; - sike_fpcopy(a, tt); - fpinv_chain_mont(tt); - fpsqr_mont(tt, tt); - fpsqr_mont(tt, tt); - sike_fpmul_mont(a, tt, a); -} - -// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit. -#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) -inline static unsigned int mp_add(const felm_t a, const felm_t b, felm_t c, const unsigned int nwords) { - uint8_t carry = 0; - for (size_t i = 0; i < nwords; i++) { - ADDC(carry, a[i], b[i], carry, c[i]); - } - return carry; -} - -// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. -inline static unsigned int mp_sub(const felm_t a, const felm_t b, felm_t c, const unsigned int nwords) { - uint32_t borrow = 0; - for (size_t i = 0; i < nwords; i++) { - SUBC(borrow, a[i], b[i], borrow, c[i]); - } - return borrow; -} -#endif - -// Multiprecision addition, c = a+b. -inline static void mp_addfast(const felm_t a, const felm_t b, felm_t c) -{ -#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) - mp_add(a, b, c, NWORDS_FIELD); -#else - sike_mpadd_asm(a, b, c); -#endif -} - -// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. -// If c < 0 then returns mask = 0xFF..F, else mask = 0x00..0 -inline static crypto_word_t mp_subfast(const dfelm_t a, const dfelm_t b, dfelm_t c) { -#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) - return (0 - (crypto_word_t)mp_sub(a, b, c, 2*NWORDS_FIELD)); -#else - return sike_mpsubx2_asm(a, b, c); -#endif -} - -// Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. -// Inputs should be s.t. 
c > a and c > b -inline static void mp_dblsubfast(const dfelm_t a, const dfelm_t b, dfelm_t c) { -#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) - mp_sub(c, a, c, 2*NWORDS_FIELD); - mp_sub(c, b, c, 2*NWORDS_FIELD); -#else - sike_mpdblsubx2_asm(a, b, c); -#endif -} - -// Copy a field element, c = a. -void sike_fpcopy(const felm_t a, felm_t c) { - for (size_t i = 0; i < NWORDS_FIELD; i++) { - c[i] = a[i]; - } -} - -// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod prime, where R=2^768 -void sike_fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc) -{ - dfelm_t temp = {0}; - sike_mpmul(ma, mb, temp); - sike_fprdc(temp, mc); -} - -// Conversion from Montgomery representation to standard representation, -// c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. -void sike_from_mont(const felm_t ma, felm_t c) -{ - felm_t one = {0}; - one[0] = 1; - - sike_fpmul_mont(ma, one, c); - sike_fpcorrection(c); -} - -// GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). -// Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1] -// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] -void sike_fp2sqr_mont(const f2elm_t a, f2elm_t c) { - felm_t t1, t2, t3; - - mp_addfast(a->c0, a->c1, t1); // t1 = a0+a1 - sike_fpsub(a->c0, a->c1, t2); // t2 = a0-a1 - mp_addfast(a->c0, a->c0, t3); // t3 = 2a0 - sike_fpmul_mont(t1, t2, c->c0); // c0 = (a0+a1)(a0-a1) - sike_fpmul_mont(t3, a->c1, c->c1); // c1 = 2a0*a1 -} - -// Modular negation, a = -a mod p503. -// Input/output: a in [0, 2*p503-1] -void sike_fpneg(felm_t a) { - uint32_t borrow = 0; - for (size_t i = 0; i < NWORDS_FIELD; i++) { - SUBC(borrow, params.prime_x2[i], a[i], borrow, a[i]); - } -} - -// Modular division by two, c = a/2 mod p503. 
-// Input : a in [0, 2*p503-1] -// Output: c in [0, 2*p503-1] -void sike_fpdiv2(const felm_t a, felm_t c) { - uint32_t carry = 0; - crypto_word_t mask; - - mask = 0 - (crypto_word_t)(a[0] & 1); // If a is odd compute a+p503 - for (size_t i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, a[i], params.prime[i] & mask, carry, c[i]); - } - - // Multiprecision right shift by one. - for (size_t i = 0; i < NWORDS_FIELD-1; i++) { - c[i] = (c[i] >> 1) ^ (c[i+1] << (RADIX - 1)); - } - c[NWORDS_FIELD-1] >>= 1; -} - -// Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1]. -void sike_fpcorrection(felm_t a) { - uint32_t borrow = 0; - crypto_word_t mask; - - for (size_t i = 0; i < NWORDS_FIELD; i++) { - SUBC(borrow, a[i], params.prime[i], borrow, a[i]); - } - mask = 0 - (crypto_word_t)borrow; - - borrow = 0; - for (size_t i = 0; i < NWORDS_FIELD; i++) { - ADDC(borrow, a[i], params.prime[i] & mask, borrow, a[i]); - } -} - -// GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2). -// Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1] -// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] -void sike_fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c) { - felm_t t1, t2; - dfelm_t tt1, tt2, tt3; - crypto_word_t mask; - - mp_addfast(a->c0, a->c1, t1); // t1 = a0+a1 - mp_addfast(b->c0, b->c1, t2); // t2 = b0+b1 - sike_mpmul(a->c0, b->c0, tt1); // tt1 = a0*b0 - sike_mpmul(a->c1, b->c1, tt2); // tt2 = a1*b1 - sike_mpmul(t1, t2, tt3); // tt3 = (a0+a1)*(b0+b1) - mp_dblsubfast(tt1, tt2, tt3); // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 - mask = mp_subfast(tt1, tt2, tt1); // tt1 = a0*b0 - a1*b1. 
If tt1 < 0 then mask = 0xFF..F, else if tt1 >= 0 then mask = 0x00..0 - - for (size_t i = 0; i < NWORDS_FIELD; i++) { - t1[i] = params.prime[i] & mask; - } - - sike_fprdc(tt3, c->c1); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 - mp_addfast(&tt1[NWORDS_FIELD], t1, &tt1[NWORDS_FIELD]); - sike_fprdc(tt1, c->c0); // c[0] = a0*b0 - a1*b1 -} - -// GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2). -void sike_fp2inv_mont(f2elm_t a) { - f2elm_t t1; - - fpsqr_mont(a->c0, t1->c0); // t10 = a0^2 - fpsqr_mont(a->c1, t1->c1); // t11 = a1^2 - sike_fpadd(t1->c0, t1->c1, t1->c0); // t10 = a0^2+a1^2 - fpinv_mont(t1->c0); // t10 = (a0^2+a1^2)^-1 - sike_fpneg(a->c1); // a = a0-i*a1 - sike_fpmul_mont(a->c0, t1->c0, a->c0); - sike_fpmul_mont(a->c1, t1->c0, a->c1); // a = (a0-i*a1)*(a0^2+a1^2)^-1 -} diff --git a/src/kem/sike/sike-p434-sha256/fpx.h b/src/kem/sike/sike-p434-sha256/fpx.h deleted file mode 100644 index b9255ac7..00000000 --- a/src/kem/sike/sike-p434-sha256/fpx.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef FPX_H_ -#define FPX_H_ - -#include "utils.h" - -#if defined(__cplusplus) -extern "C" { -#endif - -// Modular addition, c = a+b mod p. -void sike_fpadd(const felm_t a, const felm_t b, felm_t c); -// Modular subtraction, c = a-b mod p. -void sike_fpsub(const felm_t a, const felm_t b, felm_t c); -// Modular division by two, c = a/2 mod p. -void sike_fpdiv2(const felm_t a, felm_t c); -// Modular correction to reduce field element a in [0, 2*p-1] to [0, p-1]. -void sike_fpcorrection(felm_t a); -// Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords. 
-void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c); -// 443-bit Montgomery reduction, c = a mod p -void sike_fprdc(const dfelm_t a, felm_t c); -// Double 2x443-bit multiprecision subtraction, c = c-a-b -void sike_mpdblsubx2_asm(const felm_t a, const felm_t b, felm_t c); -// Multiprecision subtraction, c = a-b -crypto_word_t sike_mpsubx2_asm(const dfelm_t a, const dfelm_t b, dfelm_t c); -// 443-bit multiprecision addition, c = a+b -void sike_mpadd_asm(const felm_t a, const felm_t b, felm_t c); -// Modular negation, a = -a mod p. -void sike_fpneg(felm_t a); -// Copy of a field element, c = a -void sike_fpcopy(const felm_t a, felm_t c); -// Copy a field element, c = a. -void sike_fpzero(felm_t a); -// If option = 0xFF...FF x=y; y=x, otherwise swap doesn't happen. Constant time. -void sike_cswap_asm(point_proj_t x, point_proj_t y, const crypto_word_t option); -// Conversion from Montgomery representation to standard representation, -// c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. -void sike_from_mont(const felm_t ma, felm_t c); -// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p443, where R=2^768 -void sike_fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc); -// GF(p443^2) multiplication using Montgomery arithmetic, c = a*b in GF(p443^2) -void sike_fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c); -// GF(p443^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) -void sike_fp2inv_mont(f2elm_t a); -// GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). -void sike_fp2sqr_mont(const f2elm_t a, f2elm_t c); -// Modular correction, a = a in GF(p^2). -void sike_fp2correction(f2elm_t a); - -#if defined(__cplusplus) -} // extern C -#endif - -// GF(p^2) addition, c = a+b in GF(p^2). -#define sike_fp2add(a, b, c) \ -do { \ - sike_fpadd(a->c0, b->c0, c->c0); \ - sike_fpadd(a->c1, b->c1, c->c1); \ -} while(0) - -// GF(p^2) subtraction, c = a-b in GF(p^2). 
-#define sike_fp2sub(a,b,c) \ -do { \ - sike_fpsub(a->c0, b->c0, c->c0); \ - sike_fpsub(a->c1, b->c1, c->c1); \ -} while(0) - -// Copy a GF(p^2) element, c = a. -#define sike_fp2copy(a, c) \ -do { \ - sike_fpcopy(a->c0, c->c0); \ - sike_fpcopy(a->c1, c->c1); \ -} while(0) - -// GF(p^2) negation, a = -a in GF(p^2). -#define sike_fp2neg(a) \ -do { \ - sike_fpneg(a->c0); \ - sike_fpneg(a->c1); \ -} while(0) - -// GF(p^2) division by two, c = a/2 in GF(p^2). -#define sike_fp2div2(a, c) \ -do { \ - sike_fpdiv2(a->c0, c->c0); \ - sike_fpdiv2(a->c1, c->c1); \ -} while(0) - -// Modular correction, a = a in GF(p^2). -#define sike_fp2correction(a) \ -do { \ - sike_fpcorrection(a->c0); \ - sike_fpcorrection(a->c1); \ -} while(0) - -// Conversion of a GF(p^2) element to Montgomery representation, -// mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2). -#define sike_to_fp2mont(a, mc) \ -do { \ - sike_fpmul_mont(a->c0, params.mont_R2, mc->c0); \ - sike_fpmul_mont(a->c1, params.mont_R2, mc->c1); \ -} while(0) - -// Conversion of a GF(p^2) element from Montgomery representation to standard representation, -// c_i = ma_i*R^(-1) = a_i in GF(p^2). 
-#define sike_from_fp2mont(ma, c) \ -do { \ - sike_from_mont(ma->c0, c->c0); \ - sike_from_mont(ma->c1, c->c1); \ -} while(0) - -#endif // FPX_H_ diff --git a/src/kem/sike/sike-p434-sha256/isogeny.c b/src/kem/sike/sike-p434-sha256/isogeny.c deleted file mode 100644 index 661410e4..00000000 --- a/src/kem/sike/sike-p434-sha256/isogeny.c +++ /dev/null @@ -1,262 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: elliptic curve and isogeny functions -*********************************************************************************************/ -#include -#include -#include "utils.h" -#include "isogeny.h" -#include "fpx.h" - -static void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24) -{ // Doubling of a Montgomery point in projective coordinates (X:Z). - // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C. - // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2). - f2elm_t t0, t1; - - sike_fp2sub(P->X, P->Z, t0); // t0 = X1-Z1 - sike_fp2add(P->X, P->Z, t1); // t1 = X1+Z1 - sike_fp2sqr_mont(t0, t0); // t0 = (X1-Z1)^2 - sike_fp2sqr_mont(t1, t1); // t1 = (X1+Z1)^2 - sike_fp2mul_mont(C24, t0, Q->Z); // Z2 = C24*(X1-Z1)^2 - sike_fp2mul_mont(t1, Q->Z, Q->X); // X2 = C24*(X1-Z1)^2*(X1+Z1)^2 - sike_fp2sub(t1, t0, t1); // t1 = (X1+Z1)^2-(X1-Z1)^2 - sike_fp2mul_mont(A24plus, t1, t0); // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] - sike_fp2add(Q->Z, t0, Q->Z); // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2 - sike_fp2mul_mont(Q->Z, t1, Q->Z); // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2] -} - -void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, size_t e) -{ // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. 
- // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C. - // Output: projective Montgomery x-coordinates Q <- (2^e)*P. - - memmove(Q, P, sizeof(*P)); - for (size_t i = 0; i < e; i++) { - xDBL(Q, Q, A24plus, C24); - } -} - -void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff) -{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. - // Input: projective point of order four P = (X4:Z4). - // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients - // that are used to evaluate the isogeny at a point in eval_4_isog(). - - sike_fp2sub(P->X, P->Z, coeff[1]); // coeff[1] = X4-Z4 - sike_fp2add(P->X, P->Z, coeff[2]); // coeff[2] = X4+Z4 - sike_fp2sqr_mont(P->Z, coeff[0]); // coeff[0] = Z4^2 - sike_fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 2*Z4^2 - sike_fp2sqr_mont(coeff[0], C24); // C24 = 4*Z4^4 - sike_fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 4*Z4^2 - sike_fp2sqr_mont(P->X, A24plus); // A24plus = X4^2 - sike_fp2add(A24plus, A24plus, A24plus); // A24plus = 2*X4^2 - sike_fp2sqr_mont(A24plus, A24plus); // A24plus = 4*X4^4 -} - -void eval_4_isog(point_proj_t P, f2elm_t* coeff) -{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined - // by the 3 coefficients in coeff (computed in the function get_4_isog()). - // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z). - // Output: the projective point P = phi(P) = (X:Z) in the codomain. 
- f2elm_t t0, t1; - - sike_fp2add(P->X, P->Z, t0); // t0 = X+Z - sike_fp2sub(P->X, P->Z, t1); // t1 = X-Z - sike_fp2mul_mont(t0, coeff[1], P->X); // X = (X+Z)*coeff[1] - sike_fp2mul_mont(t1, coeff[2], P->Z); // Z = (X-Z)*coeff[2] - sike_fp2mul_mont(t0, t1, t0); // t0 = (X+Z)*(X-Z) - sike_fp2mul_mont(t0, coeff[0], t0); // t0 = coeff[0]*(X+Z)*(X-Z) - sike_fp2add(P->X, P->Z, t1); // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1] - sike_fp2sub(P->X, P->Z, P->Z); // Z = (X-Z)*coeff[2] - (X+Z)*coeff[1] - sike_fp2sqr_mont(t1, t1); // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 - sike_fp2sqr_mont(P->Z, P->Z); // Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - sike_fp2add(t1, t0, P->X); // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 - sike_fp2sub(P->Z, t0, t0); // t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z) - sike_fp2mul_mont(P->X, t1, P->X); // Xfinal - sike_fp2mul_mont(P->Z, t0, P->Z); // Zfinal -} - - -void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus) -{ // Tripling of a Montgomery point in projective coordinates (X:Z). - // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. - // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3). 
- f2elm_t t0, t1, t2, t3, t4, t5, t6; - - sike_fp2sub(P->X, P->Z, t0); // t0 = X-Z - sike_fp2sqr_mont(t0, t2); // t2 = (X-Z)^2 - sike_fp2add(P->X, P->Z, t1); // t1 = X+Z - sike_fp2sqr_mont(t1, t3); // t3 = (X+Z)^2 - sike_fp2add(t0, t1, t4); // t4 = 2*X - sike_fp2sub(t1, t0, t0); // t0 = 2*Z - sike_fp2sqr_mont(t4, t1); // t1 = 4*X^2 - sike_fp2sub(t1, t3, t1); // t1 = 4*X^2 - (X+Z)^2 - sike_fp2sub(t1, t2, t1); // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2 - sike_fp2mul_mont(t3, A24plus, t5); // t5 = A24plus*(X+Z)^2 - sike_fp2mul_mont(t3, t5, t3); // t3 = A24plus*(X+Z)^3 - sike_fp2mul_mont(A24minus, t2, t6); // t6 = A24minus*(X-Z)^2 - sike_fp2mul_mont(t2, t6, t2); // t2 = A24minus*(X-Z)^3 - sike_fp2sub(t2, t3, t3); // t3 = A24minus*(X-Z)^3 - coeff*(X+Z)^3 - sike_fp2sub(t5, t6, t2); // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2 - sike_fp2mul_mont(t1, t2, t1); // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] - sike_fp2add(t3, t1, t2); // t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^3 - coeff*(X+Z)^3 - sike_fp2sqr_mont(t2, t2); // t2 = t2^2 - sike_fp2mul_mont(t4, t2, Q->X); // X3 = 2*X*t2 - sike_fp2sub(t3, t1, t1); // t1 = A24minus*(X-Z)^3 - A24plus*(X+Z)^3 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] - sike_fp2sqr_mont(t1, t1); // t1 = t1^2 - sike_fp2mul_mont(t0, t1, Q->Z); // Z3 = 2*Z*t1 -} - -void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, size_t e) -{ // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. - // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. - // Output: projective Montgomery x-coordinates Q <- (3^e)*P. 
- memmove(Q, P, sizeof(*P)); - for (size_t i = 0; i < e; i++) { - xTPL(Q, Q, A24minus, A24plus); - } -} - -void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t* coeff) -{ // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. - // Input: projective point of order three P = (X3:Z3). - // Output: the 3-isogenous Montgomery curve with projective coefficient A/C. - f2elm_t t0, t1, t2, t3, t4; - - sike_fp2sub(P->X, P->Z, coeff[0]); // coeff0 = X-Z - sike_fp2sqr_mont(coeff[0], t0); // t0 = (X-Z)^2 - sike_fp2add(P->X, P->Z, coeff[1]); // coeff1 = X+Z - sike_fp2sqr_mont(coeff[1], t1); // t1 = (X+Z)^2 - sike_fp2add(t0, t1, t2); // t2 = (X+Z)^2 + (X-Z)^2 - sike_fp2add(coeff[0], coeff[1], t3); // t3 = 2*X - sike_fp2sqr_mont(t3, t3); // t3 = 4*X^2 - sike_fp2sub(t3, t2, t3); // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2 - sike_fp2add(t1, t3, t2); // t2 = 4*X^2 - (X-Z)^2 - sike_fp2add(t3, t0, t3); // t3 = 4*X^2 - (X+Z)^2 - sike_fp2add(t0, t3, t4); // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2 - sike_fp2add(t4, t4, t4); // t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2) - sike_fp2add(t1, t4, t4); // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 - sike_fp2mul_mont(t2, t4, A24minus); // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] - sike_fp2add(t1, t2, t4); // t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2 - sike_fp2add(t4, t4, t4); // t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2) - sike_fp2add(t0, t4, t4); // t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2 - sike_fp2mul_mont(t3, t4, t4); // t4 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] - sike_fp2sub(t4, A24minus, t0); // t0 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] - [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] - sike_fp2add(A24minus, t0, A24plus); // A24plus = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 -} - - -void eval_3_isog(point_proj_t Q, f2elm_t* coeff) -{ // Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and - // a point P with 2 coefficients in coeff (computed in the function 
get_3_isog()). - // Inputs: projective points P = (X3:Z3) and Q = (X:Z). - // Output: the projective point Q <- phi(Q) = (X3:Z3). - f2elm_t t0, t1, t2; - - sike_fp2add(Q->X, Q->Z, t0); // t0 = X+Z - sike_fp2sub(Q->X, Q->Z, t1); // t1 = X-Z - sike_fp2mul_mont(t0, coeff[0], t0); // t0 = coeff0*(X+Z) - sike_fp2mul_mont(t1, coeff[1], t1); // t1 = coeff1*(X-Z) - sike_fp2add(t0, t1, t2); // t2 = coeff0*(X+Z) + coeff1*(X-Z) - sike_fp2sub(t1, t0, t0); // t0 = coeff1*(X-Z) - coeff0*(X+Z) - sike_fp2sqr_mont(t2, t2); // t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2 - sike_fp2sqr_mont(t0, t0); // t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2 - sike_fp2mul_mont(Q->X, t2, Q->X); // X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2 - sike_fp2mul_mont(Q->Z, t0, Q->Z); // Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2 -} - - -void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3) -{ // 3-way simultaneous inversion - // Input: z1,z2,z3 - // Output: 1/z1,1/z2,1/z3 (override inputs). - f2elm_t t0, t1, t2, t3; - - sike_fp2mul_mont(z1, z2, t0); // t0 = z1*z2 - sike_fp2mul_mont(z3, t0, t1); // t1 = z1*z2*z3 - sike_fp2inv_mont(t1); // t1 = 1/(z1*z2*z3) - sike_fp2mul_mont(z3, t1, t2); // t2 = 1/(z1*z2) - sike_fp2mul_mont(t2, z2, t3); // t3 = 1/z1 - sike_fp2mul_mont(t2, z1, z2); // z2 = 1/z2 - sike_fp2mul_mont(t0, t1, z3); // z3 = 1/z3 - sike_fp2copy(t3, z1); // z1 = 1/z1 -} - - -void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A) -{ // Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. - // Input: the x-coordinates xP, xQ, and xR of the points P, Q and R. - // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x. 
- f2elm_t t0, t1, one = F2ELM_INIT; - - extern const struct params_t params; - sike_fpcopy(params.mont_one, one->c0); - sike_fp2add(xP, xQ, t1); // t1 = xP+xQ - sike_fp2mul_mont(xP, xQ, t0); // t0 = xP*xQ - sike_fp2mul_mont(xR, t1, A); // A = xR*t1 - sike_fp2add(t0, A, A); // A = A+t0 - sike_fp2mul_mont(t0, xR, t0); // t0 = t0*xR - sike_fp2sub(A, one, A); // A = A-1 - sike_fp2add(t0, t0, t0); // t0 = t0+t0 - sike_fp2add(t1, xR, t1); // t1 = t1+xR - sike_fp2add(t0, t0, t0); // t0 = t0+t0 - sike_fp2sqr_mont(A, A); // A = A^2 - sike_fp2inv_mont(t0); // t0 = 1/t0 - sike_fp2mul_mont(A, t0, A); // A = A*t0 - sike_fp2sub(A, t1, A); // Afinal = A-t1 -} - - -void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv) -{ // Computes the j-invariant of a Montgomery curve with projective constant. - // Input: A,C in GF(p^2). - // Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x. - f2elm_t t0, t1; - - sike_fp2sqr_mont(A, jinv); // jinv = A^2 - sike_fp2sqr_mont(C, t1); // t1 = C^2 - sike_fp2add(t1, t1, t0); // t0 = t1+t1 - sike_fp2sub(jinv, t0, t0); // t0 = jinv-t0 - sike_fp2sub(t0, t1, t0); // t0 = t0-t1 - sike_fp2sub(t0, t1, jinv); // jinv = t0-t1 - sike_fp2sqr_mont(t1, t1); // t1 = t1^2 - sike_fp2mul_mont(jinv, t1, jinv); // jinv = jinv*t1 - sike_fp2add(t0, t0, t0); // t0 = t0+t0 - sike_fp2add(t0, t0, t0); // t0 = t0+t0 - sike_fp2sqr_mont(t0, t1); // t1 = t0^2 - sike_fp2mul_mont(t0, t1, t0); // t0 = t0*t1 - sike_fp2add(t0, t0, t0); // t0 = t0+t0 - sike_fp2add(t0, t0, t0); // t0 = t0+t0 - sike_fp2inv_mont(jinv); // jinv = 1/jinv - sike_fp2mul_mont(jinv, t0, jinv); // jinv = t0*jinv -} - - -void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24) -{ // Simultaneous doubling and differential addition. 
- // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4. - // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP. - f2elm_t t0, t1, t2; - - sike_fp2add(P->X, P->Z, t0); // t0 = XP+ZP - sike_fp2sub(P->X, P->Z, t1); // t1 = XP-ZP - sike_fp2sqr_mont(t0, P->X); // XP = (XP+ZP)^2 - sike_fp2sub(Q->X, Q->Z, t2); // t2 = XQ-ZQ - sike_fp2correction(t2); - sike_fp2add(Q->X, Q->Z, Q->X); // XQ = XQ+ZQ - sike_fp2mul_mont(t0, t2, t0); // t0 = (XP+ZP)*(XQ-ZQ) - sike_fp2sqr_mont(t1, P->Z); // ZP = (XP-ZP)^2 - sike_fp2mul_mont(t1, Q->X, t1); // t1 = (XP-ZP)*(XQ+ZQ) - sike_fp2sub(P->X, P->Z, t2); // t2 = (XP+ZP)^2-(XP-ZP)^2 - sike_fp2mul_mont(P->X, P->Z, P->X); // XP = (XP+ZP)^2*(XP-ZP)^2 - sike_fp2mul_mont(t2, A24, Q->X); // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2] - sike_fp2sub(t0, t1, Q->Z); // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ) - sike_fp2add(Q->X, P->Z, P->Z); // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2 - sike_fp2add(t0, t1, Q->X); // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ) - sike_fp2mul_mont(P->Z, t2, P->Z); // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2] - sike_fp2sqr_mont(Q->Z, Q->Z); // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 - sike_fp2sqr_mont(Q->X, Q->X); // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2 - sike_fp2mul_mont(Q->Z, xPQ, Q->Z); // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 -} diff --git a/src/kem/sike/sike-p434-sha256/isogeny.h b/src/kem/sike/sike-p434-sha256/isogeny.h deleted file mode 100644 index 460c8c66..00000000 --- a/src/kem/sike/sike-p434-sha256/isogeny.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef ISOGENY_H_ -#define ISOGENY_H_ - -// Computes [2^e](X:Z) on Montgomery curve with projective -// constant via e repeated doublings. 
-void xDBLe( - const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, - const f2elm_t C24, size_t e); -// Simultaneous doubling and differential addition. -void xDBLADD( - point_proj_t P, point_proj_t Q, const f2elm_t xPQ, - const f2elm_t A24); -// Tripling of a Montgomery point in projective coordinates (X:Z). -void xTPL( - const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, - const f2elm_t A24plus); -// Computes [3^e](X:Z) on Montgomery curve with projective constant -// via e repeated triplings. -void xTPLe( - const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, - const f2elm_t A24plus, size_t e); -// Given the x-coordinates of P, Q, and R, returns the value A -// corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. -void get_A( - const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A); -// Computes the j-invariant of a Montgomery curve with projective constant. -void j_inv( - const f2elm_t A, const f2elm_t C, f2elm_t jinv); -// Computes the corresponding 4-isogeny of a projective Montgomery -// point (X4:Z4) of order 4. -void get_4_isog( - const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff); -// Computes the corresponding 3-isogeny of a projective Montgomery -// point (X3:Z3) of order 3. -void get_3_isog( - const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, - f2elm_t* coeff); -// Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) -// of order 3 on a Montgomery curve and a point P with coefficients given in coeff. -void eval_3_isog( - point_proj_t Q, f2elm_t* coeff); -// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny. 
-void eval_4_isog( - point_proj_t P, f2elm_t* coeff); -// 3-way simultaneous inversion -void inv_3_way( - f2elm_t z1, f2elm_t z2, f2elm_t z3); - -#endif // ISOGENY_H_ diff --git a/src/kem/sike/sike-p434-sha256/params.c b/src/kem/sike/sike-p434-sha256/params.c deleted file mode 100644 index b13f4c87..00000000 --- a/src/kem/sike/sike-p434-sha256/params.c +++ /dev/null @@ -1,128 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: supersingular isogeny parameters and generation of functions for P434 -*********************************************************************************************/ - -#include "utils.h" - -// Parameters for isogeny system "SIKE" -const struct params_t params = { - .prime = { - U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), - U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFDC1767AE2FFFFFF), - U64_TO_WORDS(0x7BC65C783158AEA3), U64_TO_WORDS(0x6CFC5FD681C52056), - U64_TO_WORDS(0x0002341F27177344) - }, - .prime_p1 = { - U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), - U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xFDC1767AE3000000), - U64_TO_WORDS(0x7BC65C783158AEA3), U64_TO_WORDS(0x6CFC5FD681C52056), - U64_TO_WORDS(0x0002341F27177344) - }, - .prime_x2 = { - U64_TO_WORDS(0xFFFFFFFFFFFFFFFE), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), - U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFB82ECF5C5FFFFFF), - U64_TO_WORDS(0xF78CB8F062B15D47), U64_TO_WORDS(0xD9F8BFAD038A40AC), - U64_TO_WORDS(0x0004683E4E2EE688) - }, - .A_gen = { - U64_TO_WORDS(0x05ADF455C5C345BF), U64_TO_WORDS(0x91935C5CC767AC2B), - U64_TO_WORDS(0xAFE4E879951F0257), U64_TO_WORDS(0x70E792DC89FA27B1), - U64_TO_WORDS(0xF797F526BB48C8CD), U64_TO_WORDS(0x2181DB6131AF621F), - U64_TO_WORDS(0x00000A1C08B1ECC4), // XPA0 - U64_TO_WORDS(0x74840EB87CDA7788), U64_TO_WORDS(0x2971AA0ECF9F9D0B), - U64_TO_WORDS(0xCB5732BDF41715D5), 
U64_TO_WORDS(0x8CD8E51F7AACFFAA), - U64_TO_WORDS(0xA7F424730D7E419F), U64_TO_WORDS(0xD671EB919A179E8C), - U64_TO_WORDS(0x0000FFA26C5A924A), // XPA1 - U64_TO_WORDS(0xFEC6E64588B7273B), U64_TO_WORDS(0xD2A626D74CBBF1C6), - U64_TO_WORDS(0xF8F58F07A78098C7), U64_TO_WORDS(0xE23941F470841B03), - U64_TO_WORDS(0x1B63EDA2045538DD), U64_TO_WORDS(0x735CFEB0FFD49215), - U64_TO_WORDS(0x0001C4CB77542876), // XQA0 - U64_TO_WORDS(0xADB0F733C17FFDD6), U64_TO_WORDS(0x6AFFBD037DA0A050), - U64_TO_WORDS(0x680EC43DB144E02F), U64_TO_WORDS(0x1E2E5D5FF524E374), - U64_TO_WORDS(0xE2DDA115260E2995), U64_TO_WORDS(0xA6E4B552E2EDE508), - U64_TO_WORDS(0x00018ECCDDF4B53E), // XQA1 - U64_TO_WORDS(0x01BA4DB518CD6C7D), U64_TO_WORDS(0x2CB0251FE3CC0611), - U64_TO_WORDS(0x259B0C6949A9121B), U64_TO_WORDS(0x60E17AC16D2F82AD), - U64_TO_WORDS(0x3AA41F1CE175D92D), U64_TO_WORDS(0x413FBE6A9B9BC4F3), - U64_TO_WORDS(0x00022A81D8D55643), // XRA0 - U64_TO_WORDS(0xB8ADBC70FC82E54A), U64_TO_WORDS(0xEF9CDDB0D5FADDED), - U64_TO_WORDS(0x5820C734C80096A0), U64_TO_WORDS(0x7799994BAA96E0E4), - U64_TO_WORDS(0x044961599E379AF8), U64_TO_WORDS(0xDB2B94FBF09F27E2), - U64_TO_WORDS(0x0000B87FC716C0C6) // XRA1 - }, - .B_gen = { - U64_TO_WORDS(0x6E5497556EDD48A3), U64_TO_WORDS(0x2A61B501546F1C05), - U64_TO_WORDS(0xEB919446D049887D), U64_TO_WORDS(0x5864A4A69D450C4F), - U64_TO_WORDS(0xB883F276A6490D2B), U64_TO_WORDS(0x22CC287022D5F5B9), - U64_TO_WORDS(0x0001BED4772E551F), // XPB0 - U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), - U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), - U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), - U64_TO_WORDS(0x0000000000000000), // XPB1 - U64_TO_WORDS(0xFAE2A3F93D8B6B8E), U64_TO_WORDS(0x494871F51700FE1C), - U64_TO_WORDS(0xEF1A94228413C27C), U64_TO_WORDS(0x498FF4A4AF60BD62), - U64_TO_WORDS(0xB00AD2A708267E8A), U64_TO_WORDS(0xF4328294E017837F), - U64_TO_WORDS(0x000034080181D8AE), // XQB0 - U64_TO_WORDS(0x0000000000000000), 
U64_TO_WORDS(0x0000000000000000), - U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), - U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), - U64_TO_WORDS(0x0000000000000000), // XQB1 - U64_TO_WORDS(0x283B34FAFEFDC8E4), U64_TO_WORDS(0x9208F44977C3E647), - U64_TO_WORDS(0x7DEAE962816F4E9A), U64_TO_WORDS(0x68A2BA8AA262EC9D), - U64_TO_WORDS(0x8176F112EA43F45B), U64_TO_WORDS(0x02106D022634F504), - U64_TO_WORDS(0x00007E8A50F02E37), // XRB0 - U64_TO_WORDS(0xB378B7C1DA22CCB1), U64_TO_WORDS(0x6D089C99AD1D9230), - U64_TO_WORDS(0xEBE15711813E2369), U64_TO_WORDS(0x2B35A68239D48A53), - U64_TO_WORDS(0x445F6FD138407C93), U64_TO_WORDS(0xBEF93B29A3F6B54B), - U64_TO_WORDS(0x000173FA910377D3) // XRB1 - }, - .mont_R2 = { - U64_TO_WORDS(0x28E55B65DCD69B30), U64_TO_WORDS(0xACEC7367768798C2), - U64_TO_WORDS(0xAB27973F8311688D), U64_TO_WORDS(0x175CC6AF8D6C7C0B), - U64_TO_WORDS(0xABCD92BF2DDE347E), U64_TO_WORDS(0x69E16A61C7686D9A), - U64_TO_WORDS(0x000025A89BCDD12A) - }, - .mont_one = { - U64_TO_WORDS(0x000000000000742C), U64_TO_WORDS(0x0000000000000000), - U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xB90FF404FC000000), - U64_TO_WORDS(0xD801A4FB559FACD4), U64_TO_WORDS(0xE93254545F77410C), - U64_TO_WORDS(0x0000ECEEA7BD2EDA) - }, - .mont_six = { - U64_TO_WORDS(0x000000000002B90A), U64_TO_WORDS(0x0000000000000000), - U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x5ADCCB2822000000), - U64_TO_WORDS(0x187D24F39F0CAFB4), U64_TO_WORDS(0x9D353A4D394145A0), - U64_TO_WORDS(0x00012559A0403298) - }, - .A_strat = { - 0x30, 0x1C, 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, - 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, - 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, - 0x02, 0x01, 0x01, 0x0D, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02, - 0x01, 0x01, 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x04, - 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, - 0x15, 0x0C, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, - 0x03, 0x02, 0x01, 
0x01, 0x01, 0x01, 0x05, 0x03, 0x02, 0x01, - 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x09, 0x05, 0x03, - 0x02, 0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x04, - 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01 - }, - .B_strat = { - 0x42, 0x21, 0x11, 0x09, 0x05, 0x03, 0x02, 0x01, 0x01, 0x01, - 0x01, 0x02, 0x01, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x01, - 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, - 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x10, - 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, - 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, - 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, - 0x01, 0x20, 0x10, 0x08, 0x04, 0x03, 0x01, 0x01, 0x01, 0x01, - 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, - 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, - 0x01, 0x01, 0x02, 0x01, 0x01, 0x10, 0x08, 0x04, 0x02, 0x01, - 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, - 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, - 0x02, 0x01, 0x01, 0x02, 0x01, 0x01 - } -}; diff --git a/src/kem/sike/sike-p434-sha256/sike.c b/src/kem/sike/sike-p434-sha256/sike.c deleted file mode 100644 index f00ebe76..00000000 --- a/src/kem/sike/sike-p434-sha256/sike.c +++ /dev/null @@ -1,517 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: supersingular isogeny key encapsulation (SIKE) protocol -*********************************************************************************************/ - -#include -#include -#include -#include -#include -#include - -#include "utils.h" -#include "isogeny.h" -#include "fpx.h" - -extern const struct params_t params; - -// SIDH_JINV_BYTESZ is a number of bytes used for encoding j-invariant. 
-#define SIDH_JINV_BYTESZ 110U -// SIDH_PRV_A_BITSZ is a number of bits of SIDH private key (2-isogeny) -#define SIDH_PRV_A_BITSZ 216U -// SIDH_PRV_A_BITSZ is a number of bits of SIDH private key (3-isogeny) -#define SIDH_PRV_B_BITSZ 217U -// MAX_INT_POINTS_ALICE is a number of points used in 2-isogeny tree computation -#define MAX_INT_POINTS_ALICE 7U -// MAX_INT_POINTS_ALICE is a number of points used in 3-isogeny tree computation -#define MAX_INT_POINTS_BOB 8U - -// Swap points. -// If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P -#if !defined(ARCH_X86_64) || defined(ARCH_GENERIC) -static void sike_cswap(point_proj_t P, point_proj_t Q, const crypto_word_t option) -{ - crypto_word_t temp; - for (size_t i = 0; i < NWORDS_FIELD; i++) { - temp = option & (P->X->c0[i] ^ Q->X->c0[i]); - P->X->c0[i] = temp ^ P->X->c0[i]; - Q->X->c0[i] = temp ^ Q->X->c0[i]; - temp = option & (P->Z->c0[i] ^ Q->Z->c0[i]); - P->Z->c0[i] = temp ^ P->Z->c0[i]; - Q->Z->c0[i] = temp ^ Q->Z->c0[i]; - temp = option & (P->X->c1[i] ^ Q->X->c1[i]); - P->X->c1[i] = temp ^ P->X->c1[i]; - Q->X->c1[i] = temp ^ Q->X->c1[i]; - temp = option & (P->Z->c1[i] ^ Q->Z->c1[i]); - P->Z->c1[i] = temp ^ P->Z->c1[i]; - Q->Z->c1[i] = temp ^ Q->Z->c1[i]; - } -} -#endif - -// Swap points. 
-// If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P -static inline void sike_fp2cswap(point_proj_t P, point_proj_t Q, const crypto_word_t option) -{ -#if defined(ARCH_X86_64) && !defined(ARCH_GENERIC) - sike_cswap_asm(P, Q, option); -#else - sike_cswap(P, Q, option); -#endif -} - -static void ladder3Pt( - const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const uint8_t* m, - int is_A, point_proj_t R, const f2elm_t A) { - point_proj_t R0 = POINT_PROJ_INIT, R2 = POINT_PROJ_INIT; - f2elm_t A24 = F2ELM_INIT; - crypto_word_t mask; - int bit, swap, prevbit = 0; - - const size_t nbits = is_A?SIDH_PRV_A_BITSZ:SIDH_PRV_B_BITSZ; - - // Initializing constant - sike_fpcopy(params.mont_one, A24[0].c0); - sike_fp2add(A24, A24, A24); - sike_fp2add(A, A24, A24); - sike_fp2div2(A24, A24); - sike_fp2div2(A24, A24); // A24 = (A+2)/4 - - // Initializing points - sike_fp2copy(xQ, R0->X); - sike_fpcopy(params.mont_one, R0->Z[0].c0); - sike_fp2copy(xPQ, R2->X); - sike_fpcopy(params.mont_one, R2->Z[0].c0); - sike_fp2copy(xP, R->X); - sike_fpcopy(params.mont_one, R->Z[0].c0); - memset(R->Z->c1, 0, sizeof(R->Z->c1)); - - // Main loop - for (size_t i = 0; i < nbits; i++) { - bit = (m[i >> 3] >> (i & 7)) & 1; - swap = bit ^ prevbit; - prevbit = bit; - mask = 0 - (crypto_word_t)swap; - - sike_fp2cswap(R, R2, mask); - xDBLADD(R0, R2, R->X, A24); - sike_fp2mul_mont(R2->X, R->Z, R2->X); - } - swap = 0 ^ prevbit; - mask = 0 - (crypto_word_t)swap; - sike_fp2cswap(R, R2, mask); -} - -// Initialization of basis points -static inline void sike_init_basis(const crypto_word_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR) { - sike_fpcopy(gen, XP->c0); - sike_fpcopy(gen + NWORDS_FIELD, XP->c1); - sike_fpcopy(gen + 2*NWORDS_FIELD, XQ->c0); - sike_fpcopy(gen + 3*NWORDS_FIELD, XQ->c1); - sike_fpcopy(gen + 4*NWORDS_FIELD, XR->c0); - sike_fpcopy(gen + 5*NWORDS_FIELD, XR->c1); -} - -// Conversion of GF(p^2) element from Montgomery to standard representation. 
-static inline void sike_fp2_encode(const f2elm_t x, uint8_t *enc) { - f2elm_t t; - sike_from_fp2mont(x, t); - - // convert to bytes in little endian form - for (size_t i=0; i> (8*(i%LSZ))) & 0xFF; - enc[i+FIELD_BYTESZ] = (t[0].c1[i/LSZ] >> (8*(i%LSZ))) & 0xFF; - } -} - -// Parse byte sequence back into GF(p^2) element, and conversion to Montgomery representation. -// Elements over GF(p503) are encoded in 63 octets in little endian format -// (i.e., the least significant octet is located in the lowest memory address). -static inline void fp2_decode(const uint8_t *enc, f2elm_t t) { - memset(t[0].c0, 0, sizeof(t[0].c0)); - memset(t[0].c1, 0, sizeof(t[0].c1)); - // convert bytes in little endian form to f2elm_t - for (size_t i = 0; i < FIELD_BYTESZ; i++) { - t[0].c0[i/LSZ] |= ((crypto_word_t)enc[i+ 0]) << (8*(i%LSZ)); - t[0].c1[i/LSZ] |= ((crypto_word_t)enc[i+FIELD_BYTESZ]) << (8*(i%LSZ)); - } - sike_to_fp2mont(t, t); -} - -// Alice's ephemeral public key generation -// Input: a private key prA in the range [0, 2^250 - 1], stored in 32 bytes. -// Output: the public key pkA consisting of 3 GF(p503^2) elements encoded in 378 bytes. 
-static void gen_iso_A(const uint8_t* skA, uint8_t* pkA) -{ - point_proj_t R, pts[MAX_INT_POINTS_ALICE]; - point_proj_t phiP = POINT_PROJ_INIT; - point_proj_t phiQ = POINT_PROJ_INIT; - point_proj_t phiR = POINT_PROJ_INIT; - f2elm_t XPA, XQA, XRA, coeff[3]; - f2elm_t A24plus = F2ELM_INIT; - f2elm_t C24 = F2ELM_INIT; - f2elm_t A = F2ELM_INIT; - unsigned int m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; - - // Initialize basis points - sike_init_basis(params.A_gen, XPA, XQA, XRA); - sike_init_basis(params.B_gen, phiP->X, phiQ->X, phiR->X); - sike_fpcopy(params.mont_one, (phiP->Z)->c0); - sike_fpcopy(params.mont_one, (phiQ->Z)->c0); - sike_fpcopy(params.mont_one, (phiR->Z)->c0); - - // Initialize constants: A24plus = A+2C, C24 = 4C, where A=6, C=1 - sike_fpcopy(params.mont_one, A24plus->c0); - sike_fp2add(A24plus, A24plus, A24plus); - sike_fp2add(A24plus, A24plus, C24); - sike_fp2add(A24plus, C24, A); - sike_fp2add(C24, C24, A24plus); - - // Retrieve kernel point - ladder3Pt(XPA, XQA, XRA, skA, 1, R, A); - - // Traverse tree - index = 0; - for (size_t row = 1; row < A_max; row++) { - while (index < A_max-row) { - sike_fp2copy(R->X, pts[npts]->X); - sike_fp2copy(R->Z, pts[npts]->Z); - pts_index[npts++] = index; - m = params.A_strat[ii++]; - xDBLe(R, R, A24plus, C24, (2*m)); - index += m; - } - get_4_isog(R, A24plus, C24, coeff); - - for (size_t i = 0; i < npts; i++) { - eval_4_isog(pts[i], coeff); - } - eval_4_isog(phiP, coeff); - eval_4_isog(phiQ, coeff); - eval_4_isog(phiR, coeff); - - sike_fp2copy(pts[npts-1]->X, R->X); - sike_fp2copy(pts[npts-1]->Z, R->Z); - index = pts_index[npts-1]; - npts -= 1; - } - - get_4_isog(R, A24plus, C24, coeff); - eval_4_isog(phiP, coeff); - eval_4_isog(phiQ, coeff); - eval_4_isog(phiR, coeff); - - inv_3_way(phiP->Z, phiQ->Z, phiR->Z); - sike_fp2mul_mont(phiP->X, phiP->Z, phiP->X); - sike_fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); - sike_fp2mul_mont(phiR->X, phiR->Z, phiR->X); - - // Format public key - 
sike_fp2_encode(phiP->X, pkA); - sike_fp2_encode(phiQ->X, pkA + SIDH_JINV_BYTESZ); - sike_fp2_encode(phiR->X, pkA + 2*SIDH_JINV_BYTESZ); -} - -// Bob's ephemeral key-pair generation -// It produces a private key skB and computes the public key pkB. -// The private key is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. -// The public key consists of 3 GF(p503^2) elements encoded in 378 bytes. -static void gen_iso_B(const uint8_t* skB, uint8_t* pkB) -{ - point_proj_t R, pts[MAX_INT_POINTS_BOB]; - point_proj_t phiP = POINT_PROJ_INIT; - point_proj_t phiQ = POINT_PROJ_INIT; - point_proj_t phiR = POINT_PROJ_INIT; - f2elm_t XPB, XQB, XRB, coeff[3]; - f2elm_t A24plus = F2ELM_INIT; - f2elm_t A24minus = F2ELM_INIT; - f2elm_t A = F2ELM_INIT; - unsigned int m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; - - // Initialize basis points - sike_init_basis(params.B_gen, XPB, XQB, XRB); - sike_init_basis(params.A_gen, phiP->X, phiQ->X, phiR->X); - sike_fpcopy(params.mont_one, (phiP->Z)->c0); - sike_fpcopy(params.mont_one, (phiQ->Z)->c0); - sike_fpcopy(params.mont_one, (phiR->Z)->c0); - - // Initialize constants: A24minus = A-2C, A24plus = A+2C, where A=6, C=1 - sike_fpcopy(params.mont_one, A24plus->c0); - sike_fp2add(A24plus, A24plus, A24plus); - sike_fp2add(A24plus, A24plus, A24minus); - sike_fp2add(A24plus, A24minus, A); - sike_fp2add(A24minus, A24minus, A24plus); - - // Retrieve kernel point - ladder3Pt(XPB, XQB, XRB, skB, 0, R, A); - - // Traverse tree - index = 0; - for (size_t row = 1; row < B_max; row++) { - while (index < B_max-row) { - sike_fp2copy(R->X, pts[npts]->X); - sike_fp2copy(R->Z, pts[npts]->Z); - pts_index[npts++] = index; - m = params.B_strat[ii++]; - xTPLe(R, R, A24minus, A24plus, m); - index += m; - } - get_3_isog(R, A24minus, A24plus, coeff); - - for (size_t i = 0; i < npts; i++) { - eval_3_isog(pts[i], coeff); - } - eval_3_isog(phiP, coeff); - eval_3_isog(phiQ, coeff); - eval_3_isog(phiR, coeff); - - 
sike_fp2copy(pts[npts-1]->X, R->X); - sike_fp2copy(pts[npts-1]->Z, R->Z); - index = pts_index[npts-1]; - npts -= 1; - } - - get_3_isog(R, A24minus, A24plus, coeff); - eval_3_isog(phiP, coeff); - eval_3_isog(phiQ, coeff); - eval_3_isog(phiR, coeff); - - inv_3_way(phiP->Z, phiQ->Z, phiR->Z); - sike_fp2mul_mont(phiP->X, phiP->Z, phiP->X); - sike_fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); - sike_fp2mul_mont(phiR->X, phiR->Z, phiR->X); - - // Format public key - sike_fp2_encode(phiP->X, pkB); - sike_fp2_encode(phiQ->X, pkB + SIDH_JINV_BYTESZ); - sike_fp2_encode(phiR->X, pkB + 2*SIDH_JINV_BYTESZ); -} - -// Alice's ephemeral shared secret computation -// It produces a shared secret key ssA using her secret key skA and Bob's public key pkB -// Inputs: Alice's skA is an integer in the range [0, 2^250 - 1], stored in 32 bytes. -// Bob's pkB consists of 3 GF(p503^2) elements encoded in 378 bytes. -// Output: a shared secret ssA that consists of one element in GF(p503^2) encoded in 126 bytes. -static void ex_iso_A(const uint8_t* skA, const uint8_t* pkB, uint8_t* ssA) -{ - point_proj_t R, pts[MAX_INT_POINTS_ALICE]; - f2elm_t coeff[3], PKB[3], jinv; - f2elm_t A24plus = F2ELM_INIT; - f2elm_t C24 = F2ELM_INIT; - f2elm_t A = F2ELM_INIT; - unsigned int m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; - - // Initialize images of Bob's basis - fp2_decode(pkB, PKB[0]); - fp2_decode(pkB + SIDH_JINV_BYTESZ, PKB[1]); - fp2_decode(pkB + 2*SIDH_JINV_BYTESZ, PKB[2]); - - // Initialize constants - get_A(PKB[0], PKB[1], PKB[2], A); - sike_fpadd(params.mont_one, params.mont_one, C24->c0); - sike_fp2add(A, C24, A24plus); - sike_fpadd(C24->c0, C24->c0, C24->c0); - - // Retrieve kernel point - ladder3Pt(PKB[0], PKB[1], PKB[2], skA, 1, R, A); - - // Traverse tree - index = 0; - for (size_t row = 1; row < A_max; row++) { - while (index < A_max-row) { - sike_fp2copy(R->X, pts[npts]->X); - sike_fp2copy(R->Z, pts[npts]->Z); - pts_index[npts++] = index; - m = params.A_strat[ii++]; - 
xDBLe(R, R, A24plus, C24, (2*m)); - index += m; - } - get_4_isog(R, A24plus, C24, coeff); - - for (size_t i = 0; i < npts; i++) { - eval_4_isog(pts[i], coeff); - } - - sike_fp2copy(pts[npts-1]->X, R->X); - sike_fp2copy(pts[npts-1]->Z, R->Z); - index = pts_index[npts-1]; - npts -= 1; - } - - get_4_isog(R, A24plus, C24, coeff); - sike_fp2add(A24plus, A24plus, A24plus); - sike_fp2sub(A24plus, C24, A24plus); - sike_fp2add(A24plus, A24plus, A24plus); - j_inv(A24plus, C24, jinv); - sike_fp2_encode(jinv, ssA); -} - -// Bob's ephemeral shared secret computation -// It produces a shared secret key ssB using his secret key skB and Alice's public key pkA -// Inputs: Bob's skB is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. -// Alice's pkA consists of 3 GF(p503^2) elements encoded in 378 bytes. -// Output: a shared secret ssB that consists of one element in GF(p503^2) encoded in 126 bytes. -static void ex_iso_B(const uint8_t* skB, const uint8_t* pkA, uint8_t* ssB) -{ - point_proj_t R, pts[MAX_INT_POINTS_BOB]; - f2elm_t coeff[3], PKB[3], jinv; - f2elm_t A24plus = F2ELM_INIT; - f2elm_t A24minus = F2ELM_INIT; - f2elm_t A = F2ELM_INIT; - unsigned int m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; - - // Initialize images of Alice's basis - fp2_decode(pkA, PKB[0]); - fp2_decode(pkA + SIDH_JINV_BYTESZ, PKB[1]); - fp2_decode(pkA + 2*SIDH_JINV_BYTESZ, PKB[2]); - - // Initialize constants - get_A(PKB[0], PKB[1], PKB[2], A); - sike_fpadd(params.mont_one, params.mont_one, A24minus->c0); - sike_fp2add(A, A24minus, A24plus); - sike_fp2sub(A, A24minus, A24minus); - - // Retrieve kernel point - ladder3Pt(PKB[0], PKB[1], PKB[2], skB, 0, R, A); - - // Traverse tree - index = 0; - for (size_t row = 1; row < B_max; row++) { - while (index < B_max-row) { - sike_fp2copy(R->X, pts[npts]->X); - sike_fp2copy(R->Z, pts[npts]->Z); - pts_index[npts++] = index; - m = params.B_strat[ii++]; - xTPLe(R, R, A24minus, A24plus, m); - index += m; - } - get_3_isog(R, 
A24minus, A24plus, coeff); - - for (size_t i = 0; i < npts; i++) { - eval_3_isog(pts[i], coeff); - } - - sike_fp2copy(pts[npts-1]->X, R->X); - sike_fp2copy(pts[npts-1]->Z, R->Z); - index = pts_index[npts-1]; - npts -= 1; - } - - get_3_isog(R, A24minus, A24plus, coeff); - sike_fp2add(A24plus, A24minus, A); - sike_fp2add(A, A, A); - sike_fp2sub(A24plus, A24minus, A24plus); - j_inv(A, A24plus, jinv); - sike_fp2_encode(jinv, ssB); -} - -int SIKE_keypair(uint8_t out_priv[SIKE_PRV_BYTESZ], - uint8_t out_pub[SIKE_PUB_BYTESZ]) { - // Calculate private key for Alice. Needs to be in range [0, 2^0xFA - 1] and < - // 253 bits - randombytes(out_priv, SIKE_PRV_BYTESZ); - out_priv[31] = (out_priv[31] | 0x01) & 0x03; - - gen_iso_B(out_priv, out_pub); - return 1; -} - -void SIKE_encaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], - uint8_t out_ciphertext[SIKE_CT_BYTESZ], - const uint8_t pub_key[SIKE_PUB_BYTESZ]) { - // Secret buffer is reused by the function to store some ephemeral - // secret data. It's size must be maximum of 64, - // SIKE_MSG_BYTESZ and SIDH_PRV_A_BITSZ in bytes. - uint8_t secret[32]; // OZAPTF, why? 
- uint8_t j[SIDH_JINV_BYTESZ]; - uint8_t temp[SIKE_MSG_BYTESZ + SIKE_CT_BYTESZ]; - SHA256_CTX ctx; - - // Generate secret key for A - // secret key A = SHA256({0,1}^n || pub_key)) mod SIDH_PRV_A_BITSZ - randombytes(temp, SIKE_MSG_BYTESZ); - - sha256_init(&ctx); - sha256_update(&ctx, temp, SIKE_MSG_BYTESZ); - sha256_update(&ctx, pub_key, SIKE_PUB_BYTESZ); - sha256_final(&ctx, secret); - - // Generate public key for A - first part of the ciphertext - gen_iso_A(secret, out_ciphertext); - - // Generate c1: - // h = SHA256(j-invariant) - // c1 = h ^ m - ex_iso_A(secret, pub_key, j); - sha256_init(&ctx); - sha256_update(&ctx, j, sizeof(j)); - sha256_final(&ctx, secret); - - // c1 = h ^ m - uint8_t *c1 = &out_ciphertext[SIKE_PUB_BYTESZ]; - for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) { - c1[i] = temp[i] ^ secret[i]; - } - - sha256_init(&ctx); - sha256_update(&ctx, temp, SIKE_MSG_BYTESZ); - sha256_update(&ctx, out_ciphertext, SIKE_CT_BYTESZ); - sha256_final(&ctx, secret); - // Generate shared secret out_shared_key = SHA256(m||out_ciphertext) - memcpy(out_shared_key, secret, SIKE_SS_BYTESZ); -} - -void SIKE_decaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], - const uint8_t ciphertext[SIKE_CT_BYTESZ], - const uint8_t pub_key[SIKE_PUB_BYTESZ], - const uint8_t priv_key[SIKE_PRV_BYTESZ]) { - // Secret buffer is reused by the function to store some ephemeral - // secret data. It's size must be maximum of 64, - // SIKE_MSG_BYTESZ and SIDH_PRV_A_BITSZ in bytes. 
- uint8_t secret[32]; - uint8_t j[SIDH_JINV_BYTESZ]; - uint8_t c0[SIKE_PUB_BYTESZ]; - uint8_t temp[SIKE_MSG_BYTESZ]; - uint8_t shared_nok[SIKE_MSG_BYTESZ]; - SHA256_CTX ctx; - - // This is OK as we are only using ephemeral keys in BoringSSL - randombytes(shared_nok, SIKE_MSG_BYTESZ); - - // Recover m - // Let ciphertext = c0 || c1 - both have fixed sizes - // m = F(j-invariant(c0, priv_key)) ^ c1 - ex_iso_B(priv_key, ciphertext, j); - - sha256_init(&ctx); - sha256_update(&ctx, j, sizeof(j)); - sha256_final(&ctx, secret); - - const uint8_t *c1 = &ciphertext[sizeof(c0)]; - for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) { - temp[i] = c1[i] ^ secret[i]; - } - - sha256_init(&ctx); - sha256_update(&ctx, temp, SIKE_MSG_BYTESZ); - sha256_update(&ctx, pub_key, SIKE_PUB_BYTESZ); - sha256_final(&ctx, secret); - - // Recover c0 = public key A - gen_iso_A(secret, c0); - crypto_word_t ok = ct_uint_eq( - ct_mem_eq(c0, ciphertext, SIKE_PUB_BYTESZ), 1); - for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) { - temp[i] = ct_select_8(ok, temp[i], shared_nok[i]); - } - - sha256_init(&ctx); - sha256_update(&ctx, temp, SIKE_MSG_BYTESZ); - sha256_update(&ctx, ciphertext, SIKE_CT_BYTESZ); - sha256_final(&ctx, secret); - - // Generate shared secret out_shared_key = SHA256(m||ciphertext) - memcpy(out_shared_key, secret, SIKE_SS_BYTESZ); -} diff --git a/src/kem/sike/sike-p434-sha256/utils.h b/src/kem/sike/sike-p434-sha256/utils.h deleted file mode 100644 index 87623d33..00000000 --- a/src/kem/sike/sike-p434-sha256/utils.h +++ /dev/null @@ -1,231 +0,0 @@ -/******************************************************************************************** -* SIDH: an efficient supersingular isogeny cryptography library -* -* Abstract: internal header file for P434 -*********************************************************************************************/ - -#ifndef UTILS_H_ -#define UTILS_H_ - -#include -#include - -// Conversion macro from number of bits to number of bytes -#define BITS_TO_BYTES(nbits) 
(((nbits)+7)/8) - -// Bit size of the field -#define BITS_FIELD 434 -// Byte size of the field -#define FIELD_BYTESZ BITS_TO_BYTES(BITS_FIELD) -// Number of 64-bit words of a 224-bit element -#define NBITS_ORDER 224 -#define NWORDS64_ORDER ((NBITS_ORDER+63)/64) -// Number of elements in Alice's strategy -#define A_max 108 -// Number of elements in Bob's strategy -#define B_max 137 -// Word size size -#define RADIX sizeof(crypto_word_t)*8 -// Byte size of a limb -#define LSZ sizeof(crypto_word_t) - -#if defined(CPU_64_BIT) - typedef uint64_t crypto_word_t; - // Number of words of a 434-bit field element - #define NWORDS_FIELD 7 - // Number of "0" digits in the least significant part of p434 + 1 - #define ZERO_WORDS 3 - // U64_TO_WORDS expands |x| for a |crypto_word_t| array literal. - #define U64_TO_WORDS(x) UINT64_C(x) -#else - typedef uint32_t crypto_word_t; - // Number of words of a 434-bit field element - #define NWORDS_FIELD 14 - // Number of "0" digits in the least significant part of p434 + 1 - #define ZERO_WORDS 6 - // U64_TO_WORDS expands |x| for a |crypto_word_t| array literal. 
- #define U64_TO_WORDS(x) \ - (uint32_t)(UINT64_C(x) & 0xffffffff), (uint32_t)(UINT64_C(x) >> 32) -#endif - -// Extended datatype support -#if !defined(HAS_UINT128) - typedef uint64_t uint128_t[2]; -#endif - -// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise -// Digit multiplication -#define MUL(multiplier, multiplicand, hi, lo) digit_x_digit((multiplier), (multiplicand), &(lo)); - -// If mask |x|==0xff.ff set |x| to 1, otherwise 0 -#define M2B(x) ((x)>>(RADIX-1)) - -// Digit addition with carry -#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ -do { \ - crypto_word_t tempReg = (addend1) + (crypto_word_t)(carryIn); \ - (sumOut) = (addend2) + tempReg; \ - (carryOut) = M2B(ct_uint_lt(tempReg, (crypto_word_t)(carryIn)) | \ - ct_uint_lt((sumOut), tempReg)); \ -} while(0) - -// Digit subtraction with borrow -#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ -do { \ - crypto_word_t tempReg = (minuend) - (subtrahend); \ - crypto_word_t borrowReg = M2B(ct_uint_lt((minuend), (subtrahend))); \ - borrowReg |= ((borrowIn) & ct_uint_eq(tempReg, 0)); \ - (differenceOut) = tempReg - (crypto_word_t)(borrowIn); \ - (borrowOut) = borrowReg; \ -} while(0) - -/* Old GCC 4.9 (jessie) doesn't implement {0} initialization properly, - which violates C11 as described in 6.7.9, 21 (similarily C99, 6.7.8). - Defines below are used to work around the bug, and provide a way - to initialize f2elem_t and point_proj_t structs. - Bug has been fixed in GCC6 (debian stretch). -*/ -#define F2ELM_INIT {{ {0}, {0} }} -#define POINT_PROJ_INIT {{ F2ELM_INIT, F2ELM_INIT }} - -// Datatype for representing 434-bit field elements (448-bit max.) -// Elements over GF(p434) are encoded in 63 octets in little endian format -// (i.e., the least significant octet is located in the lowest memory address). -typedef crypto_word_t felm_t[NWORDS_FIELD]; - -// An element in F_{p^2}, is composed of two coefficients from F_p, * i.e. 
-// Fp2 element = c0 + c1*i in F_{p^2} -// Datatype for representing double-precision 2x434-bit field elements (448-bit max.) -// Elements (a+b*i) over GF(p434^2), where a and b are defined over GF(p434), are -// encoded as {a, b}, with a in the lowest memory portion. -typedef struct { - felm_t c0; - felm_t c1; -} fp2; - -// Our F_{p^2} element type is a pointer to the struct. -typedef fp2 f2elm_t[1]; - -// Datatype for representing double-precision 2x434-bit -// field elements in contiguous memory. -typedef crypto_word_t dfelm_t[2*NWORDS_FIELD]; - -// Constants used during SIKE computation. -struct params_t { - // Stores a prime - const crypto_word_t prime[NWORDS_FIELD]; - // Stores prime + 1 - const crypto_word_t prime_p1[NWORDS_FIELD]; - // Stores prime * 2 - const crypto_word_t prime_x2[NWORDS_FIELD]; - // Alice's generator values {XPA0 + XPA1*i, XQA0 + XQA1*i, XRA0 + XRA1*i} - // in GF(prime^2), expressed in Montgomery representation - const crypto_word_t A_gen[6*NWORDS_FIELD]; - // Bob's generator values {XPB0 + XPB1*i, XQB0 + XQB1*i, XRB0 + XRB1*i} - // in GF(prime^2), expressed in Montgomery representation - const crypto_word_t B_gen[6*NWORDS_FIELD]; - // Montgomery constant mont_R2 = (2^448)^2 mod prime - const crypto_word_t mont_R2[NWORDS_FIELD]; - // Value 'one' in Montgomery representation - const crypto_word_t mont_one[NWORDS_FIELD]; - // Value '6' in Montgomery representation - const crypto_word_t mont_six[NWORDS_FIELD]; - // Fixed parameters for isogeny tree computation - const unsigned int A_strat[A_max-1]; - const unsigned int B_strat[B_max-1]; -}; - -// Point representation in projective XZ Montgomery coordinates. -typedef struct { - f2elm_t X; - f2elm_t Z; -} point_proj; -typedef point_proj point_proj_t[1]; - -// Checks whether two words are equal. Returns 1 in case it is, -// otherwise 0. 
-static inline crypto_word_t ct_uint_eq(crypto_word_t x, crypto_word_t y) -{ - // if x==y then t = 0 - crypto_word_t t = x ^ y; - // if x!=y t will have first bit set - t = (t >> 1) - t; - // return MSB - 1 in case x==y, otherwise 0 - return ((~t) >> (RADIX-1)); -} -// Constant time select. -// if pick == 1 (out = in1) -// if pick == 0 (out = in2) -// else out is undefined -static inline uint8_t ct_select_8(uint8_t flag, uint8_t in1, uint8_t in2) { - uint8_t mask = ((int8_t)(flag << 7))>>7; - return (in1&mask) | (in2&(~mask)); -} - -// Constant time memcmp. Returns 1 if p==q, otherwise 0 -static inline int ct_mem_eq(const void *p, const void *q, size_t n) -{ - const uint8_t *pp = (uint8_t*)p, *qq = (uint8_t*)q; - uint8_t a = 0; - - while (n--) a |= *pp++ ^ *qq++; - return (ct_uint_eq(a, 0)); -} - -/* -// Returns 1 if x> (RADIX-1)); -} -*/ - -/// OZAPTF: coppied from boringssl -static inline crypto_word_t constant_time_msb_w(crypto_word_t a) { - return 0u - (a >> (sizeof(a) * 8 - 1)); -} - -// constant_time_lt_w returns 0xff..f if a < b and 0 otherwise. -static inline crypto_word_t ct_uint_lt(crypto_word_t x, crypto_word_t y) -{ - /* - const crypto_word_t t1 = x^y; - const crypto_word_t t2 = x - y; - const crypto_word_t tt = x ^ (t1 | (t2^y)); - return (tt >> (RADIX-1)); - */ - // Consider the two cases of the problem: - // msb(a) == msb(b): a < b iff the MSB of a - b is set. - // msb(a) != msb(b): a < b iff the MSB of b is set. - // - // If msb(a) == msb(b) then the following evaluates as: - // msb(a^((a^b)|((a-b)^a))) == - // msb(a^((a-b) ^ a)) == (because msb(a^b) == 0) - // msb(a^a^(a-b)) == (rearranging) - // msb(a-b) (because ∀x. x^x == 0) - // - // Else, if msb(a) != msb(b) then the following evaluates as: - // msb(a^((a^b)|((a-b)^a))) == - // msb(a^(𝟙 | ((a-b)^a))) == (because msb(a^b) == 1 and 𝟙 - // represents a value s.t. 
msb(𝟙) = 1) - // msb(a^𝟙) == (because ORing with 1 results in 1) - // msb(b) - // - // - // Here is an SMT-LIB verification of this formula: - // - // (define-fun lt ((a (_ BitVec 32)) (b (_ BitVec 32))) (_ BitVec 32) - // (bvxor a (bvor (bvxor a b) (bvxor (bvsub a b) a))) - // ) - // - // (declare-fun a () (_ BitVec 32)) - // (declare-fun b () (_ BitVec 32)) - // - // (assert (not (= (= #x00000001 (bvlshr (lt a b) #x0000001f)) (bvult a b)))) - // (check-sat) - // (get-model) - return constant_time_msb_w(x^((x^y)|((x-y)^x))); -} -#endif // UTILS_H_ diff --git a/src/rustapi/pqc-sys/src/bindings.rs b/src/rustapi/pqc-sys/src/bindings.rs index 3bcde42d..de93cfad 100644 --- a/src/rustapi/pqc-sys/src/bindings.rs +++ b/src/rustapi/pqc-sys/src/bindings.rs @@ -256,7 +256,8 @@ pub const SABER: ::std::os::raw::c_uint = 15; pub const HQCRMRS128: ::std::os::raw::c_uint = 16; pub const HQCRMRS192: ::std::os::raw::c_uint = 17; pub const HQCRMRS256: ::std::os::raw::c_uint = 18; -pub const PQC_ALG_KEM_MAX: ::std::os::raw::c_uint = 19; +pub const SIKE434: ::std::os::raw::c_uint = 19; +pub const PQC_ALG_KEM_MAX: ::std::os::raw::c_uint = 20; pub type _bindgen_ty_2 = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] diff --git a/test/katrunner/src/main.rs b/test/katrunner/src/main.rs index 6e6dad92..f6ee752c 100644 --- a/test/katrunner/src/main.rs +++ b/test/katrunner/src/main.rs @@ -212,6 +212,7 @@ const KATS: &'static[Register] = &[ REG_KEM!(HQCRMRS128, "round3/hqc/hqc-128/hqc-128_kat.rsp"), REG_KEM!(HQCRMRS192, "round3/hqc/hqc-192/hqc-192_kat.rsp"), REG_KEM!(HQCRMRS256, "round3/hqc/hqc-256/hqc-256_kat.rsp"), + REG_KEM!(SIKE434, "round3/sike/PQCkemKAT_374.rsp"), // Those are Round2. KATs are very big, so skip testing until it makes sense to do so. 
//REG_SIGN!(RAINBOWVCLASSIC), From b2f9d52be5ccf5983561e48ce9ad3a6ae7e2f532 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Fri, 9 Apr 2021 00:44:47 +0100 Subject: [PATCH 03/12] update sike --- src/kem/sike/p434/fp-x86_64.S | 1080 ++++++++++++++++++++++++++++++++ src/kem/sike/p434/fp_generic.c | 173 +++++ src/kem/sike/p434/fpx.c | 282 +++++++++ src/kem/sike/p434/fpx.h | 112 ++++ src/kem/sike/p434/isogeny.c | 262 ++++++++ src/kem/sike/p434/isogeny.h | 49 ++ src/kem/sike/p434/params.c | 128 ++++ src/kem/sike/p434/sike.c | 522 +++++++++++++++ src/kem/sike/p434/utils.h | 214 +++++++ 9 files changed, 2822 insertions(+) create mode 100644 src/kem/sike/p434/fp-x86_64.S create mode 100644 src/kem/sike/p434/fp_generic.c create mode 100644 src/kem/sike/p434/fpx.c create mode 100644 src/kem/sike/p434/fpx.h create mode 100644 src/kem/sike/p434/isogeny.c create mode 100644 src/kem/sike/p434/isogeny.h create mode 100644 src/kem/sike/p434/params.c create mode 100644 src/kem/sike/p434/sike.c create mode 100644 src/kem/sike/p434/utils.h diff --git a/src/kem/sike/p434/fp-x86_64.S b/src/kem/sike/p434/fp-x86_64.S new file mode 100644 index 00000000..f2f32392 --- /dev/null +++ b/src/kem/sike/p434/fp-x86_64.S @@ -0,0 +1,1080 @@ +.text + +.Lp434x2: +.quad 0xFFFFFFFFFFFFFFFE +.quad 0xFFFFFFFFFFFFFFFF +.quad 0xFB82ECF5C5FFFFFF +.quad 0xF78CB8F062B15D47 +.quad 0xD9F8BFAD038A40AC +.quad 0x0004683E4E2EE688 + + +.Lp434p1: +.quad 0xFDC1767AE3000000 +.quad 0x7BC65C783158AEA3 +.quad 0x6CFC5FD681C52056 +.quad 0x0002341F27177344 + +.globl sike_fpadd +.hidden sike_fpadd +.type sike_fpadd,@function +sike_fpadd: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + + xorq %rax,%rax + + movq 0(%rdi),%r8 + addq 0(%rsi),%r8 + movq 8(%rdi),%r9 + adcq 8(%rsi),%r9 + movq 16(%rdi),%r10 + adcq 16(%rsi),%r10 + movq 24(%rdi),%r11 + adcq 24(%rsi),%r11 + movq 
32(%rdi),%r12 + adcq 32(%rsi),%r12 + movq 40(%rdi),%r13 + adcq 40(%rsi),%r13 + movq 48(%rdi),%r14 + adcq 48(%rsi),%r14 + + movq .Lp434x2(%rip),%rcx + subq %rcx,%r8 + movq 8+.Lp434x2(%rip),%rcx + sbbq %rcx,%r9 + sbbq %rcx,%r10 + movq 16+.Lp434x2(%rip),%rcx + sbbq %rcx,%r11 + movq 24+.Lp434x2(%rip),%rcx + sbbq %rcx,%r12 + movq 32+.Lp434x2(%rip),%rcx + sbbq %rcx,%r13 + movq 40+.Lp434x2(%rip),%rcx + sbbq %rcx,%r14 + + sbbq $0,%rax + + movq .Lp434x2(%rip),%rdi + andq %rax,%rdi + movq 8+.Lp434x2(%rip),%rsi + andq %rax,%rsi + movq 16+.Lp434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc %cl + movq 24+.Lp434x2(%rip),%r8 + andq %rax,%r8 + movq 32+.Lp434x2(%rip),%r9 + andq %rax,%r9 + movq 40+.Lp434x2(%rip),%r10 + andq %rax,%r10 + btq $0,%rcx + + adcq %r8,%r12 + movq %r12,32(%rdx) + adcq %r9,%r13 + movq %r13,40(%rdx) + adcq %r10,%r14 + movq %r14,48(%rdx) + + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_cswap_asm +.hidden sike_cswap_asm +.type sike_cswap_asm,@function +sike_cswap_asm: + + + movq %rdx,%xmm3 + + + + + + pshufd $68,%xmm3,%xmm3 + + movdqu 0(%rdi),%xmm0 + movdqu 0(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,0(%rdi) + movdqu %xmm1,0(%rsi) + + movdqu 16(%rdi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,16(%rdi) + movdqu %xmm1,16(%rsi) + + movdqu 32(%rdi),%xmm0 + movdqu 32(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,32(%rdi) + movdqu %xmm1,32(%rsi) + + movdqu 48(%rdi),%xmm0 + movdqu 48(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor 
%xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,48(%rdi) + movdqu %xmm1,48(%rsi) + + movdqu 64(%rdi),%xmm0 + movdqu 64(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,64(%rdi) + movdqu %xmm1,64(%rsi) + + movdqu 80(%rdi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,80(%rdi) + movdqu %xmm1,80(%rsi) + + movdqu 96(%rdi),%xmm0 + movdqu 96(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,96(%rdi) + movdqu %xmm1,96(%rsi) + + movdqu 112(%rdi),%xmm0 + movdqu 112(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,112(%rdi) + movdqu %xmm1,112(%rsi) + + movdqu 128(%rdi),%xmm0 + movdqu 128(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,128(%rdi) + movdqu %xmm1,128(%rsi) + + movdqu 144(%rdi),%xmm0 + movdqu 144(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,144(%rdi) + movdqu %xmm1,144(%rsi) + + movdqu 160(%rdi),%xmm0 + movdqu 160(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,160(%rdi) + movdqu %xmm1,160(%rsi) + + movdqu 176(%rdi),%xmm0 + movdqu 176(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,176(%rdi) + movdqu %xmm1,176(%rsi) + + movdqu 192(%rdi),%xmm0 + movdqu 192(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm2,%xmm1 + movdqu %xmm0,192(%rdi) + movdqu %xmm1,192(%rsi) + + movdqu 208(%rdi),%xmm0 + movdqu 208(%rsi),%xmm1 + movdqa %xmm1,%xmm2 + pxor %xmm0,%xmm2 + pand %xmm3,%xmm2 + pxor %xmm2,%xmm0 + pxor 
%xmm2,%xmm1 + movdqu %xmm0,208(%rdi) + movdqu %xmm1,208(%rsi) + + .byte 0xf3,0xc3 +.globl sike_fpsub +.hidden sike_fpsub +.type sike_fpsub,@function +sike_fpsub: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + + xorq %rax,%rax + + movq 0(%rdi),%r8 + subq 0(%rsi),%r8 + movq 8(%rdi),%r9 + sbbq 8(%rsi),%r9 + movq 16(%rdi),%r10 + sbbq 16(%rsi),%r10 + movq 24(%rdi),%r11 + sbbq 24(%rsi),%r11 + movq 32(%rdi),%r12 + sbbq 32(%rsi),%r12 + movq 40(%rdi),%r13 + sbbq 40(%rsi),%r13 + movq 48(%rdi),%r14 + sbbq 48(%rsi),%r14 + + sbbq $0x0,%rax + + movq .Lp434x2(%rip),%rdi + andq %rax,%rdi + movq 8+.Lp434x2(%rip),%rsi + andq %rax,%rsi + movq 16+.Lp434x2(%rip),%rcx + andq %rax,%rcx + + addq %rdi,%r8 + movq %r8,0(%rdx) + adcq %rsi,%r9 + movq %r9,8(%rdx) + adcq %rsi,%r10 + movq %r10,16(%rdx) + adcq %rcx,%r11 + movq %r11,24(%rdx) + + setc %cl + movq 24+.Lp434x2(%rip),%r8 + andq %rax,%r8 + movq 32+.Lp434x2(%rip),%r9 + andq %rax,%r9 + movq 40+.Lp434x2(%rip),%r10 + andq %rax,%r10 + btq $0x0,%rcx + + adcq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpadd_asm +.hidden sike_mpadd_asm +.type sike_mpadd_asm,@function +sike_mpadd_asm: +.cfi_startproc + movq 0(%rdi),%r8; + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + addq 0(%rsi),%r8 + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + adcq 24(%rsi),%r11 + adcq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + adcq 40(%rsi),%r8 + adcq 48(%rsi),%r9 + movq %r8,40(%rdx) + movq %r9,48(%rdx) + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpsubx2_asm 
+.hidden sike_mpsubx2_asm +.type sike_mpsubx2_asm,@function +sike_mpsubx2_asm: +.cfi_startproc + xorq %rax,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %rcx,32(%rdx) + + movq 40(%rdi),%r8 + movq 48(%rdi),%r9 + movq 56(%rdi),%r10 + movq 64(%rdi),%r11 + movq 72(%rdi),%rcx + sbbq 40(%rsi),%r8 + sbbq 48(%rsi),%r9 + sbbq 56(%rsi),%r10 + sbbq 64(%rsi),%r11 + sbbq 72(%rsi),%rcx + movq %r8,40(%rdx) + movq %r9,48(%rdx) + movq %r10,56(%rdx) + movq %r11,64(%rdx) + movq %rcx,72(%rdx) + + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + sbbq 80(%rsi),%r8 + sbbq 88(%rsi),%r9 + sbbq 96(%rsi),%r10 + sbbq 104(%rsi),%r11 + sbbq $0x0,%rax + movq %r8,80(%rdx) + movq %r9,88(%rdx) + movq %r10,96(%rdx) + movq %r11,104(%rdx) + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpdblsubx2_asm +.hidden sike_mpdblsubx2_asm +.type sike_mpdblsubx2_asm,@function +sike_mpdblsubx2_asm: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + + xorq %rax,%rax + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + movq 48(%rdx),%rcx + subq 0(%rdi),%r8 + sbbq 8(%rdi),%r9 + sbbq 16(%rdi),%r10 + sbbq 24(%rdi),%r11 + sbbq 32(%rdi),%r12 + sbbq 40(%rdi),%r13 + sbbq 48(%rdi),%rcx + adcq $0x0,%rax + + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%r12 + sbbq 40(%rsi),%r13 + sbbq 48(%rsi),%rcx + adcq $0x0,%rax + + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %rcx,48(%rdx) + + + movq 56(%rdx),%r8 + movq 64(%rdx),%r9 + movq 72(%rdx),%r10 + movq 
80(%rdx),%r11 + movq 88(%rdx),%r12 + movq 96(%rdx),%r13 + movq 104(%rdx),%rcx + + subq %rax,%r8 + sbbq 56(%rdi),%r8 + sbbq 64(%rdi),%r9 + sbbq 72(%rdi),%r10 + sbbq 80(%rdi),%r11 + sbbq 88(%rdi),%r12 + sbbq 96(%rdi),%r13 + sbbq 104(%rdi),%rcx + + + subq 56(%rsi),%r8 + sbbq 64(%rsi),%r9 + sbbq 72(%rsi),%r10 + sbbq 80(%rsi),%r11 + sbbq 88(%rsi),%r12 + sbbq 96(%rsi),%r13 + sbbq 104(%rsi),%rcx + + + movq %r8,56(%rdx) + movq %r9,64(%rdx) + movq %r10,72(%rdx) + movq %r11,80(%rdx) + movq %r12,88(%rdx) + movq %r13,96(%rdx) + movq %rcx,104(%rdx) + + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc + +.globl sike_fprdc +.hidden sike_fprdc +.type sike_fprdc,@function +sike_fprdc: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15, -40 + + xorq %rax,%rax + movq 0+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 0+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 24(%rdi),%r8 + adcq 32(%rdi),%r9 + adcq 40(%rdi),%r10 + adcq 48(%rdi),%r11 + adcq 56(%rdi),%r12 + adcq 64(%rdi),%r13 + adcq 72(%rdi),%rcx + movq %r8,24(%rdi) + movq %r9,32(%rdi) + movq %r10,40(%rdi) + movq %r11,48(%rdi) + movq %r12,56(%rdi) + movq %r13,64(%rdi) + movq %rcx,72(%rdi) + movq 80(%rdi),%r8 + movq 88(%rdi),%r9 + movq 96(%rdi),%r10 + movq 104(%rdi),%r11 + 
adcq $0x0,%r8 + adcq $0x0,%r9 + adcq $0x0,%r10 + adcq $0x0,%r11 + movq %r8,80(%rdi) + movq %r9,88(%rdi) + movq %r10,96(%rdi) + movq %r11,104(%rdi) + + xorq %rax,%rax + movq 16+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 16+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 40(%rdi),%r8 + adcq 48(%rdi),%r9 + adcq 56(%rdi),%r10 + adcq 64(%rdi),%r11 + adcq 72(%rdi),%r12 + adcq 80(%rdi),%r13 + adcq 88(%rdi),%rcx + movq %r8,40(%rdi) + movq %r9,48(%rdi) + movq %r10,56(%rdi) + movq %r11,64(%rdi) + movq %r12,72(%rdi) + movq %r13,80(%rdi) + movq %rcx,88(%rdi) + movq 96(%rdi),%r8 + movq 104(%rdi),%r9 + adcq $0x0,%r8 + adcq $0x0,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + + xorq %rax,%rax + movq 32+0(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + xorq %rax,%rax + movq 32+8(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r13,%rcx + adcxq %r13,%r9 + adcxq %rcx,%r10 + + mulxq 8+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r11 + adoxq %rcx,%r10 + + mulxq 16+.Lp434p1(%rip),%rcx,%r13 + adcxq %r13,%r12 + adoxq %rcx,%r11 + + mulxq 24+.Lp434p1(%rip),%rcx,%r13 + adcxq %rax,%r13 + adoxq %rcx,%r12 + adoxq %rax,%r13 + + xorq %rcx,%rcx + addq 56(%rdi),%r8 + adcq 64(%rdi),%r9 + adcq 72(%rdi),%r10 + adcq 80(%rdi),%r11 + adcq 88(%rdi),%r12 + adcq 96(%rdi),%r13 + adcq 104(%rdi),%rcx + movq %r8,0(%rsi) + movq %r9,8(%rsi) + movq 
%r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + movq %r13,96(%rdi) + movq %rcx,104(%rdi) + + xorq %rax,%rax + movq 48(%rdi),%rdx + mulxq 0+.Lp434p1(%rip),%r8,%r9 + mulxq 8+.Lp434p1(%rip),%r12,%r10 + mulxq 16+.Lp434p1(%rip),%r13,%r11 + + adoxq %r12,%r9 + adoxq %r13,%r10 + + mulxq 24+.Lp434p1(%rip),%r13,%r12 + adoxq %r13,%r11 + adoxq %rax,%r12 + + addq 72(%rdi),%r8 + adcq 80(%rdi),%r9 + adcq 88(%rdi),%r10 + adcq 96(%rdi),%r11 + adcq 104(%rdi),%r12 + movq %r8,16(%rsi) + movq %r9,24(%rsi) + movq %r10,32(%rsi) + movq %r11,40(%rsi) + movq %r12,48(%rsi) + + + popq %r15 +.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc +.globl sike_mpmul +.hidden sike_mpmul +.type sike_mpmul,@function +sike_mpmul: +.cfi_startproc + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12, -16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13, -24 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14, -32 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15, -40 + + + movq %rdx,%rcx + xorq %rax,%rax + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx, -48 + pushq %rbp +.cfi_offset rbp, -56 +.cfi_adjust_cfa_offset 8 + subq $96,%rsp +.cfi_adjust_cfa_offset 96 + + addq 32(%rdi),%r8 + adcq 40(%rdi),%r9 + adcq 48(%rdi),%r10 + adcq $0x0,%r11 + sbbq $0x0,%rax + movq %r8,0(%rsp) + movq %r9,8(%rsp) + movq %r10,16(%rsp) + movq %r11,24(%rsp) + + + xorq %rbx,%rbx + movq 0(%rsi),%r12 + movq 8(%rsi),%r13 + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + addq 32(%rsi),%r12 + adcq 40(%rsi),%r13 + adcq 48(%rsi),%r14 + adcq $0x0,%r15 + sbbq $0x0,%rbx + movq %r12,32(%rsp) + movq %r13,40(%rsp) + movq %r14,48(%rsp) + movq %r15,56(%rsp) + + + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + + andq %rbx,%r8 + andq %rbx,%r9 + andq %rbx,%r10 + andq %rbx,%r11 + + + addq 
%r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r8,64(%rsp) + movq %r9,72(%rsp) + movq %r10,80(%rsp) + movq %r11,88(%rsp) + + + movq 0+0(%rsp),%rdx + mulxq 32+0(%rsp),%r9,%r8 + movq %r9,0+0(%rsp) + mulxq 32+8(%rsp),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 32+16(%rsp),%r11,%r10 + adoxq %r11,%r9 + mulxq 32+24(%rsp),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rsp),%rdx + mulxq 32+0(%rsp),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rsp) + adcxq %r15,%r13 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 32+8(%rsp),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rsp) + adcxq %r11,%r9 + mulxq 32+16(%rsp),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 32+24(%rsp),%rbp,%r12 + adcxq %rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rsp),%rdx + mulxq 32+0(%rsp),%r8,%r13 + xorq %rax,%rax + mulxq 32+8(%rsp),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 32+16(%rsp),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 32+24(%rsp),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rsp) + movq %r10,0+32(%rsp) + movq %r11,0+40(%rsp) + movq %r12,0+48(%rsp) + movq %rbx,0+56(%rsp) + + + + movq 0+0(%rdi),%rdx + mulxq 0+0(%rsi),%r9,%r8 + movq %r9,0+0(%rcx) + mulxq 0+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 0+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + mulxq 0+24(%rsi),%r12,%r11 + adoxq %r12,%r10 + + movq 0+8(%rdi),%rdx + mulxq 0+0(%rsi),%r12,%r13 + adoxq %rax,%r11 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adoxq %r8,%r12 + movq %r12,0+8(%rcx) + adcxq %r15,%r13 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r9,%r13 + mulxq 0+24(%rsi),%rbp,%rbx 
+ adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r10,%r14 + + movq 0+16(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r9 + adoxq %r11,%r15 + adoxq %rax,%rbx + xorq %rax,%rax + mulxq 0+8(%rsi),%r11,%r10 + adoxq %r13,%r8 + movq %r8,0+16(%rcx) + adcxq %r11,%r9 + mulxq 0+16(%rsi),%r12,%r11 + adcxq %r12,%r10 + adoxq %r14,%r9 + mulxq 0+24(%rsi),%rbp,%r12 + adcxq %rbp,%r11 + adcxq %rax,%r12 + + adoxq %r15,%r10 + adoxq %rbx,%r11 + adoxq %rax,%r12 + + movq 0+24(%rdi),%rdx + mulxq 0+0(%rsi),%r8,%r13 + xorq %rax,%rax + mulxq 0+8(%rsi),%r15,%r14 + adcxq %r15,%r13 + adoxq %r8,%r9 + mulxq 0+16(%rsi),%rbx,%r15 + adcxq %rbx,%r14 + adoxq %r13,%r10 + mulxq 0+24(%rsi),%rbp,%rbx + adcxq %rbp,%r15 + adcxq %rax,%rbx + adoxq %r14,%r11 + adoxq %r15,%r12 + adoxq %rax,%rbx + movq %r9,0+24(%rcx) + movq %r10,0+32(%rcx) + movq %r11,0+40(%rcx) + movq %r12,0+48(%rcx) + movq %rbx,0+56(%rcx) + + + + movq 32+0(%rdi),%rdx + mulxq 32+0(%rsi),%r9,%r8 + movq %r9,64+0(%rcx) + mulxq 32+8(%rsi),%r10,%r9 + xorq %rax,%rax + adoxq %r10,%r8 + mulxq 32+16(%rsi),%r11,%r10 + adoxq %r11,%r9 + + movq 32+8(%rdi),%rdx + mulxq 32+0(%rsi),%r12,%r11 + adoxq %rax,%r10 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r14,%r13 + adoxq %r8,%r12 + movq %r12,64+8(%rcx) + adcxq %r14,%r11 + + mulxq 32+16(%rsi),%r8,%r14 + adoxq %r9,%r11 + adcxq %r8,%r13 + adcxq %rax,%r14 + adoxq %r10,%r13 + + movq 32+16(%rdi),%rdx + mulxq 32+0(%rsi),%r8,%r9 + adoxq %rax,%r14 + xorq %rax,%rax + + mulxq 32+8(%rsi),%r10,%r12 + adoxq %r11,%r8 + movq %r8,64+16(%rcx) + adcxq %r13,%r9 + + mulxq 32+16(%rsi),%r11,%r8 + adcxq %r14,%r12 + adcxq %rax,%r8 + adoxq %r10,%r9 + adoxq %r12,%r11 + adoxq %rax,%r8 + movq %r9,64+24(%rcx) + movq %r11,64+32(%rcx) + movq %r8,64+40(%rcx) + + + + + movq 64(%rsp),%r8 + movq 72(%rsp),%r9 + movq 80(%rsp),%r10 + movq 88(%rsp),%r11 + + movq 32(%rsp),%rax + addq %rax,%r8 + movq 40(%rsp),%rax + adcq %rax,%r9 + movq 48(%rsp),%rax + adcq %rax,%r10 + movq 56(%rsp),%rax + adcq %rax,%r11 + + + movq 0(%rsp),%r12 + movq 8(%rsp),%r13 + movq 16(%rsp),%r14 + movq 
24(%rsp),%r15 + subq 0(%rcx),%r12 + sbbq 8(%rcx),%r13 + sbbq 16(%rcx),%r14 + sbbq 24(%rcx),%r15 + sbbq 32(%rcx),%r8 + sbbq 40(%rcx),%r9 + sbbq 48(%rcx),%r10 + sbbq 56(%rcx),%r11 + + + subq 64(%rcx),%r12 + sbbq 72(%rcx),%r13 + sbbq 80(%rcx),%r14 + sbbq 88(%rcx),%r15 + sbbq 96(%rcx),%r8 + sbbq 104(%rcx),%r9 + sbbq $0x0,%r10 + sbbq $0x0,%r11 + + addq 32(%rcx),%r12 + movq %r12,32(%rcx) + adcq 40(%rcx),%r13 + movq %r13,40(%rcx) + adcq 48(%rcx),%r14 + movq %r14,48(%rcx) + adcq 56(%rcx),%r15 + movq %r15,56(%rcx) + adcq 64(%rcx),%r8 + movq %r8,64(%rcx) + adcq 72(%rcx),%r9 + movq %r9,72(%rcx) + adcq 80(%rcx),%r10 + movq %r10,80(%rcx) + adcq 88(%rcx),%r11 + movq %r11,88(%rcx) + movq 96(%rcx),%r12 + adcq $0x0,%r12 + movq %r12,96(%rcx) + movq 104(%rcx),%r13 + adcq $0x0,%r13 + movq %r13,104(%rcx) + + addq $96,%rsp +.cfi_adjust_cfa_offset -96 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_same_value rbp + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_same_value rbx + + + popq %r15 +.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_endproc diff --git a/src/kem/sike/p434/fp_generic.c b/src/kem/sike/p434/fp_generic.c new file mode 100644 index 00000000..02e851cf --- /dev/null +++ b/src/kem/sike/p434/fp_generic.c @@ -0,0 +1,173 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: portable modular arithmetic for P503 +*********************************************************************************************/ +#include "utils.h" +#include "fpx.h" + +// Global constants +extern const struct params_t params; + +static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c) +{ // Digit multiplication, digit * digit -> 2-digit result + crypto_word_t al, ah, bl, bh, temp; + crypto_word_t albl, albh, ahbl, ahbh, res1, res2, 
res3, carry; + crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4); + crypto_word_t mask_high = (crypto_word_t)(-1) << (sizeof(crypto_word_t)*4); + + al = a & mask_low; // Low part + ah = a >> (sizeof(crypto_word_t) * 4); // High part + bl = b & mask_low; + bh = b >> (sizeof(crypto_word_t) * 4); + + albl = al*bl; + albh = al*bh; + ahbl = ah*bl; + ahbh = ah*bh; + c[0] = albl & mask_low; // C00 + + res1 = albl >> (sizeof(crypto_word_t) * 4); + res2 = ahbl & mask_low; + res3 = albh & mask_low; + temp = res1 + res2 + res3; + carry = temp >> (sizeof(crypto_word_t) * 4); + c[0] ^= temp << (sizeof(crypto_word_t) * 4); // C01 + + res1 = ahbl >> (sizeof(crypto_word_t) * 4); + res2 = albh >> (sizeof(crypto_word_t) * 4); + res3 = ahbh & mask_low; + temp = res1 + res2 + res3 + carry; + c[1] = temp & mask_low; // C10 + carry = temp & mask_high; + c[1] ^= (ahbh & mask_high) + carry; // C11 +} + +void sike_fpadd(const felm_t a, const felm_t b, felm_t c) +{ // Modular addition, c = a+b mod p434. + // Inputs: a, b in [0, 2*p434-1] + // Output: c in [0, 2*p434-1] + unsigned int i, carry = 0; + crypto_word_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, a[i], b[i], carry, c[i]); + } + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(carry, c[i], params.prime_x2[i], carry, c[i]); + } + mask = 0 - (crypto_word_t)carry; + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, c[i], params.prime_x2[i] & mask, carry, c[i]); + } +} + +void sike_fpsub(const felm_t a, const felm_t b, felm_t c) +{ // Modular subtraction, c = a-b mod p434. 
+ // Inputs: a, b in [0, 2*p434-1] + // Output: c in [0, 2*p434-1] + unsigned int i, borrow = 0; + crypto_word_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, a[i], b[i], borrow, c[i]); + } + mask = 0 - (crypto_word_t)borrow; + + borrow = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(borrow, c[i], params.prime_x2[i] & mask, borrow, c[i]); + } +} + +void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) +{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. + unsigned int i, j; + crypto_word_t t = 0, u = 0, v = 0, UV[2]; + unsigned int carry = 0; + + for (i = 0; i < NWORDS_FIELD; i++) { + for (j = 0; j <= i; j++) { + MUL(a[j], b[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = NWORDS_FIELD; i < 2*NWORDS_FIELD-1; i++) { + for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) { + MUL(a[j], b[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + c[2*NWORDS_FIELD-1] = v; +} + +void sike_fprdc(const felm_t ma, felm_t mc) +{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. + // mc = ma*R^-1 mod p434x2, where R = 2^448. + // If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. + // ma is assumed to be in Montgomery representation. 
+ unsigned int i, j, carry, count = ZERO_WORDS; + crypto_word_t UV[2], t = 0, u = 0, v = 0; + + for (i = 0; i < NWORDS_FIELD; i++) { + mc[i] = 0; + } + + for (i = 0; i < NWORDS_FIELD; i++) { + for (j = 0; j < i; j++) { + if (j < (i-ZERO_WORDS+1)) { + MUL(mc[j], params.prime_p1[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + ADDC(0, v, ma[i], carry, v); + ADDC(carry, u, 0, carry, u); + t += carry; + mc[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = NWORDS_FIELD; i < 2*NWORDS_FIELD-1; i++) { + if (count > 0) { + count -= 1; + } + for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) { + if (j < (NWORDS_FIELD-count)) { + MUL(mc[j], params.prime_p1[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + ADDC(0, v, ma[i], carry, v); + ADDC(carry, u, 0, carry, u); + t += carry; + mc[i-NWORDS_FIELD] = v; + v = u; + u = t; + t = 0; + } + ADDC(0, v, ma[2*NWORDS_FIELD-1], carry, v); + mc[NWORDS_FIELD-1] = v; +} diff --git a/src/kem/sike/p434/fpx.c b/src/kem/sike/p434/fpx.c new file mode 100644 index 00000000..30233406 --- /dev/null +++ b/src/kem/sike/p434/fpx.c @@ -0,0 +1,282 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: core functions over GF(p) and GF(p^2) +*********************************************************************************************/ +#include +#include "utils.h" +#include "fpx.h" + +extern const struct params_t params; + +// Multiprecision squaring, c = a^2 mod p. +static void fpsqr_mont(const felm_t ma, felm_t mc) +{ + dfelm_t temp = {0}; + sike_mpmul(ma, ma, temp); + sike_fprdc(temp, mc); +} + +// Chain to compute a^(p-3)/4 using Montgomery arithmetic. 
+static void fpinv_chain_mont(felm_t a) +{ + unsigned int i, j; + felm_t t[31], tt; + + // Precomputed table + fpsqr_mont(a, tt); + sike_fpmul_mont(a, tt, t[0]); + for (i = 0; i <= 29; i++) sike_fpmul_mont(t[i], tt, t[i+1]); + + sike_fpcopy(a, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[5], tt, tt); + for (i = 0; i < 10; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[14], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[3], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[23], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[13], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[7], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[12], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[30], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[1], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[30], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[21], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[2], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[19], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[1], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[26], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[16], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[10], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[6], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[0], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[20], tt, 
tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[9], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[25], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[30], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[26], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(a, tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[28], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[6], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[10], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[22], tt, tt); + for (j = 0; j < 35; j++) { + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + sike_fpmul_mont(t[30], tt, tt); + } + sike_fpcopy(tt, a); +} + +// Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. +static void fpinv_mont(felm_t a) +{ + felm_t tt = {0}; + sike_fpcopy(a, tt); + fpinv_chain_mont(tt); + fpsqr_mont(tt, tt); + fpsqr_mont(tt, tt); + sike_fpmul_mont(a, tt, a); +} + +// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit. +#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) +inline static unsigned int mp_add(const felm_t a, const felm_t b, felm_t c, const unsigned int nwords) { + uint8_t carry = 0; + for (size_t i = 0; i < nwords; i++) { + ADDC(carry, a[i], b[i], carry, c[i]); + } + return carry; +} + +// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. +inline static unsigned int mp_sub(const felm_t a, const felm_t b, felm_t c, const unsigned int nwords) { + uint32_t borrow = 0; + for (size_t i = 0; i < nwords; i++) { + SUBC(borrow, a[i], b[i], borrow, c[i]); + } + return borrow; +} +#endif + +// Multiprecision addition, c = a+b. 
+inline static void mp_addfast(const felm_t a, const felm_t b, felm_t c) +{ +#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) + mp_add(a, b, c, NWORDS_FIELD); +#else + sike_mpadd_asm(a, b, c); +#endif +} + +// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. +// If c < 0 then returns mask = 0xFF..F, else mask = 0x00..0 +inline static crypto_word_t mp_subfast(const dfelm_t a, const dfelm_t b, dfelm_t c) { +#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) + return (0 - (crypto_word_t)mp_sub(a, b, c, 2*NWORDS_FIELD)); +#else + return sike_mpsubx2_asm(a, b, c); +#endif +} + +// Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. +// Inputs should be s.t. c > a and c > b +inline static void mp_dblsubfast(const dfelm_t a, const dfelm_t b, dfelm_t c) { +#if defined(ARCH_GENERIC) || (!defined(ARCH_X86_64) && !defined(ARCH_AARCH64)) + mp_sub(c, a, c, 2*NWORDS_FIELD); + mp_sub(c, b, c, 2*NWORDS_FIELD); +#else + sike_mpdblsubx2_asm(a, b, c); +#endif +} + +// Copy a field element, c = a. +void sike_fpcopy(const felm_t a, felm_t c) { + for (size_t i = 0; i < NWORDS_FIELD; i++) { + c[i] = a[i]; + } +} + +// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod prime, where R=2^448 +void sike_fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc) +{ + dfelm_t temp = {0}; + sike_mpmul(ma, mb, temp); + sike_fprdc(temp, mc); +} + +// Conversion from Montgomery representation to standard representation, +// c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. +void sike_from_mont(const felm_t ma, felm_t c) +{ + felm_t one = {0}; + one[0] = 1; + + sike_fpmul_mont(ma, one, c); + sike_fpcorrection(c); +} + +// GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). 
+// Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1]
+// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1]
+void sike_fp2sqr_mont(const f2elm_t a, f2elm_t c) {
+    felm_t t1, t2, t3;  // scratch; each is fully written before it is read
+
+    mp_addfast(a->c0, a->c1, t1);        // t1 = a0+a1 (plain addition, no modular reduction)
+    sike_fpsub(a->c0, a->c1, t2);        // t2 = a0-a1
+    mp_addfast(a->c0, a->c0, t3);        // t3 = 2a0
+    sike_fpmul_mont(t1, t2, c->c0);      // c0 = (a0+a1)(a0-a1)
+    sike_fpmul_mont(t3, a->c1, c->c1);   // c1 = 2a0*a1
+}
+
+// Modular negation, a = -a mod p434, computed as 2*p434 - a.
+// Input/output: a in [0, 2*p434-1]. NOTE(review): a = 0 yields exactly 2*p434.
+void sike_fpneg(felm_t a) {
+    uint32_t borrow = 0;
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        SUBC(borrow, params.prime_x2[i], a[i], borrow, a[i]);  // a[i] = prime_x2[i] - a[i] - borrow
+    }
+}
+
+// Modular division by two, c = a/2 mod p434.
+// Input : a in [0, 2*p434-1]
+// Output: c in [0, 2*p434-1]
+void sike_fpdiv2(const felm_t a, felm_t c) {
+    uint32_t carry = 0;
+    crypto_word_t mask;
+
+    mask = 0 - (crypto_word_t)(a[0] & 1);     // If a is odd compute a+p434 (mask is all ones), else add 0
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        ADDC(carry, a[i], params.prime[i] & mask, carry, c[i]);
+    }
+
+    // Multiprecision right shift by one; the low bit of word i+1 moves into the top bit of word i.
+    for (size_t i = 0; i < NWORDS_FIELD-1; i++) {
+        c[i] = (c[i] >> 1) ^ (c[i+1] << (RADIX - 1));
+    }
+    c[NWORDS_FIELD-1] >>= 1;
+}
+
+// Modular correction to reduce field element a in [0, 2*p434-1] to [0, p434-1].
+void sike_fpcorrection(felm_t a) {
+    uint32_t borrow = 0;
+    crypto_word_t mask;
+
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        SUBC(borrow, a[i], params.prime[i], borrow, a[i]);  // a = a - p
+    }
+    mask = 0 - (crypto_word_t)borrow;  // all ones if the subtraction borrowed (a was below p), else 0
+
+    borrow = 0;  // reused as the carry of the conditional add-back
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        ADDC(borrow, a[i], params.prime[i] & mask, borrow, a[i]);  // a = a + (p & mask)
+    }
+}
+
+// GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2).
+// Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1]
+// Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1]
+void sike_fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c) {
+    felm_t t1, t2;
+    dfelm_t tt1, tt2, tt3;  // double-length, unreduced products
+    crypto_word_t mask;
+
+    mp_addfast(a->c0, a->c1, t1);                    // t1 = a0+a1
+    mp_addfast(b->c0, b->c1, t2);                    // t2 = b0+b1
+    sike_mpmul(a->c0, b->c0, tt1);                   // tt1 = a0*b0
+    sike_mpmul(a->c1, b->c1, tt2);                   // tt2 = a1*b1
+    sike_mpmul(t1, t2, tt3);                         // tt3 = (a0+a1)*(b0+b1)
+    mp_dblsubfast(tt1, tt2, tt3);                    // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
+    mask = mp_subfast(tt1, tt2, tt1);                // tt1 = a0*b0 - a1*b1. If tt1 < 0 then mask = 0xFF..F, else if tt1 >= 0 then mask = 0x00..0
+
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        t1[i] = params.prime[i] & mask;              // t1 = p if tt1 went negative, else 0
+    }
+
+    sike_fprdc(tt3, c->c1);                          // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
+    mp_addfast(&tt1[NWORDS_FIELD], t1, &tt1[NWORDS_FIELD]);  // add t1 to the upper NWORDS_FIELD words of tt1
+    sike_fprdc(tt1, c->c0);                          // c[0] = a0*b0 - a1*b1
+}
+
+// GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2).
+void sike_fp2inv_mont(f2elm_t a) {
+    f2elm_t t1;  // only t1->c0 carries a result past the addition below
+
+    fpsqr_mont(a->c0, t1->c0);                       // t10 = a0^2
+    fpsqr_mont(a->c1, t1->c1);                       // t11 = a1^2
+    sike_fpadd(t1->c0, t1->c1, t1->c0);              // t10 = a0^2+a1^2
+    fpinv_mont(t1->c0);                              // t10 = (a0^2+a1^2)^-1
+    sike_fpneg(a->c1);                               // a = a0-i*a1
+    sike_fpmul_mont(a->c0, t1->c0, a->c0);           // a0 = a0*(a0^2+a1^2)^-1
+    sike_fpmul_mont(a->c1, t1->c0, a->c1);           // a = (a0-i*a1)*(a0^2+a1^2)^-1
+}
diff --git a/src/kem/sike/p434/fpx.h b/src/kem/sike/p434/fpx.h
new file mode 100644
index 00000000..b9255ac7
--- /dev/null
+++ b/src/kem/sike/p434/fpx.h
@@ -0,0 +1,112 @@
+#ifndef FPX_H_
+#define FPX_H_
+
+#include "utils.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Modular addition, c = a+b mod p.
+void sike_fpadd(const felm_t a, const felm_t b, felm_t c);
+// Modular subtraction, c = a-b mod p.
+void sike_fpsub(const felm_t a, const felm_t b, felm_t c);
+// Modular division by two, c = a/2 mod p.
+void sike_fpdiv2(const felm_t a, felm_t c);
+// Modular correction to reduce field element a in [0, 2*p-1] to [0, p-1].
+void sike_fpcorrection(felm_t a);
+// Multiprecision multiply, c = a*b, where a and b hold NWORDS_FIELD words and c holds 2*NWORDS_FIELD words.
+void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c);
+// 434-bit Montgomery reduction of a double-length value, c = a*R^-1 mod p
+void sike_fprdc(const dfelm_t a, felm_t c);
+// Double 2x434-bit multiprecision subtraction, c = c-a-b (all operands are double-length)
+void sike_mpdblsubx2_asm(const dfelm_t a, const dfelm_t b, dfelm_t c);
+// 2x434-bit multiprecision subtraction, c = a-b; the result is used as a borrow mask (all ones if negative)
+crypto_word_t sike_mpsubx2_asm(const dfelm_t a, const dfelm_t b, dfelm_t c);
+// 434-bit multiprecision addition, c = a+b
+void sike_mpadd_asm(const felm_t a, const felm_t b, felm_t c);
+// Modular negation, a = -a mod p.
+void sike_fpneg(felm_t a);
+// Copy of a field element, c = a
+void sike_fpcopy(const felm_t a, felm_t c);
+// Zero a field element, a = 0 (per its name; the definition is not in this part of the patch).
+void sike_fpzero(felm_t a);
+// If option = 0xFF...FF x=y; y=x, otherwise swap doesn't happen. Constant time.
+void sike_cswap_asm(point_proj_t x, point_proj_t y, const crypto_word_t option);
+// Conversion from Montgomery representation to standard representation,
+// c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1].
+void sike_from_mont(const felm_t ma, felm_t c);
+// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p434, where R=2^448
+void sike_fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc);
+// GF(p434^2) multiplication using Montgomery arithmetic, c = a*b in GF(p434^2)
+void sike_fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c);
+// GF(p434^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
+void sike_fp2inv_mont(f2elm_t a);
+// GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2).
+void sike_fp2sqr_mont(const f2elm_t a, f2elm_t c);
+// Modular correction, a = a in GF(p^2).
+void sike_fp2correction(f2elm_t a);  // NOTE(review): shadowed by the sike_fp2correction() macro below; no function body is visible here
+
+#if defined(__cplusplus)
+}  // extern C
+#endif
+
+// GF(p^2) addition, c = a+b in GF(p^2). Like every macro below, each argument is expanded twice.
+#define sike_fp2add(a, b, c)                      \
+do {                                              \
+    sike_fpadd((a)->c0, (b)->c0, (c)->c0);        \
+    sike_fpadd((a)->c1, (b)->c1, (c)->c1);        \
+} while(0)
+
+// GF(p^2) subtraction, c = a-b in GF(p^2).
+#define sike_fp2sub(a,b,c)                        \
+do {                                              \
+    sike_fpsub((a)->c0, (b)->c0, (c)->c0);        \
+    sike_fpsub((a)->c1, (b)->c1, (c)->c1);        \
+} while(0)
+
+// Copy a GF(p^2) element, c = a.
+#define sike_fp2copy(a, c)                        \
+do {                                              \
+    sike_fpcopy((a)->c0, (c)->c0);                \
+    sike_fpcopy((a)->c1, (c)->c1);                \
+} while(0)
+
+// GF(p^2) negation, a = -a in GF(p^2).
+#define sike_fp2neg(a)                            \
+do {                                              \
+    sike_fpneg((a)->c0);                          \
+    sike_fpneg((a)->c1);                          \
+} while(0)
+
+// GF(p^2) division by two, c = a/2 in GF(p^2).
+#define sike_fp2div2(a, c)                        \
+do {                                              \
+    sike_fpdiv2((a)->c0, (c)->c0);                \
+    sike_fpdiv2((a)->c1, (c)->c1);                \
+} while(0)
+
+// Modular correction, a = a in GF(p^2): both coordinates are reduced to [0, p-1].
+#define sike_fp2correction(a)                     \
+do {                                              \
+    sike_fpcorrection((a)->c0);                   \
+    sike_fpcorrection((a)->c1);                   \
+} while(0)
+
+// Conversion of a GF(p^2) element to Montgomery representation (needs `params` in scope at the call site),
+// mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2).
+#define sike_to_fp2mont(a, mc)                              \
+do {                                                        \
+    sike_fpmul_mont((a)->c0, params.mont_R2, (mc)->c0);     \
+    sike_fpmul_mont((a)->c1, params.mont_R2, (mc)->c1);     \
+} while(0)
+
+// Conversion of a GF(p^2) element from Montgomery representation to standard representation,
+// c_i = ma_i*R^(-1) = a_i in GF(p^2).
+#define sike_from_fp2mont(ma, c)                  \
+do {                                              \
+    sike_from_mont((ma)->c0, (c)->c0);            \
+    sike_from_mont((ma)->c1, (c)->c1);            \
+} while(0)
+
+#endif // FPX_H_
diff --git a/src/kem/sike/p434/isogeny.c b/src/kem/sike/p434/isogeny.c
new file mode 100644
index 00000000..661410e4
--- /dev/null
+++ b/src/kem/sike/p434/isogeny.c
@@ -0,0 +1,262 @@
+/********************************************************************************************
+* SIDH: an efficient supersingular isogeny cryptography library
+*
+* Abstract: elliptic curve and isogeny functions
+*********************************************************************************************/
+#include <stddef.h>  // size_t   - NOTE(review): header name was lost in extraction; restored from usage, confirm
+#include <string.h>  // memmove  - NOTE(review): header name was lost in extraction; restored from usage, confirm
+#include "utils.h"
+#include "isogeny.h"
+#include "fpx.h"
+
+static void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24)
+{ // Doubling of a Montgomery point in projective coordinates (X:Z).
+  // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C.
+  // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2). xDBLe() calls this with Q aliasing P.
+    f2elm_t t0, t1;
+
+    sike_fp2sub(P->X, P->Z, t0);                         // t0 = X1-Z1
+    sike_fp2add(P->X, P->Z, t1);                         // t1 = X1+Z1
+    sike_fp2sqr_mont(t0, t0);                            // t0 = (X1-Z1)^2
+    sike_fp2sqr_mont(t1, t1);                            // t1 = (X1+Z1)^2
+    sike_fp2mul_mont(C24, t0, Q->Z);                     // Z2 = C24*(X1-Z1)^2
+    sike_fp2mul_mont(t1, Q->Z, Q->X);                    // X2 = C24*(X1-Z1)^2*(X1+Z1)^2
+    sike_fp2sub(t1, t0, t1);                             // t1 = (X1+Z1)^2-(X1-Z1)^2
+    sike_fp2mul_mont(A24plus, t1, t0);                   // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2]
+    sike_fp2add(Q->Z, t0, Q->Z);                         // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2
+    sike_fp2mul_mont(Q->Z, t1, Q->Z);                    // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2]
+}
+
+void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, size_t e)
+{ // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings.
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C.
+  // Output: projective Montgomery x-coordinates Q <- (2^e)*P. Q may alias P (memmove is used for the copy).
+
+    memmove(Q, P, sizeof(*P));
+    for (size_t i = 0; i < e; i++) {
+        xDBL(Q, Q, A24plus, C24);
+    }
+}
+
+void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff)
+{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4.
+  // Input: projective point of order four P = (X4:Z4).
+  // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients
+  //         coeff[0..2] that are used to evaluate the isogeny at a point in eval_4_isog().
+
+    sike_fp2sub(P->X, P->Z, coeff[1]);                   // coeff[1] = X4-Z4
+    sike_fp2add(P->X, P->Z, coeff[2]);                   // coeff[2] = X4+Z4
+    sike_fp2sqr_mont(P->Z, coeff[0]);                    // coeff[0] = Z4^2
+    sike_fp2add(coeff[0], coeff[0], coeff[0]);           // coeff[0] = 2*Z4^2
+    sike_fp2sqr_mont(coeff[0], C24);                     // C24 = 4*Z4^4
+    sike_fp2add(coeff[0], coeff[0], coeff[0]);           // coeff[0] = 4*Z4^2
+    sike_fp2sqr_mont(P->X, A24plus);                     // A24plus = X4^2
+    sike_fp2add(A24plus, A24plus, A24plus);              // A24plus = 2*X4^2
+    sike_fp2sqr_mont(A24plus, A24plus);                  // A24plus = 4*X4^4
+}
+
+void eval_4_isog(point_proj_t P, f2elm_t* coeff)
+{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined
+  // by the 3 coefficients in coeff (computed in the function get_4_isog()).
+  // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z).
+  // Output: the projective point P = phi(P) = (X:Z) in the codomain (P is overwritten).
+    f2elm_t t0, t1;
+
+    sike_fp2add(P->X, P->Z, t0);                         // t0 = X+Z
+    sike_fp2sub(P->X, P->Z, t1);                         // t1 = X-Z
+    sike_fp2mul_mont(t0, coeff[1], P->X);                // X = (X+Z)*coeff[1]
+    sike_fp2mul_mont(t1, coeff[2], P->Z);                // Z = (X-Z)*coeff[2]
+    sike_fp2mul_mont(t0, t1, t0);                        // t0 = (X+Z)*(X-Z)
+    sike_fp2mul_mont(t0, coeff[0], t0);                  // t0 = coeff[0]*(X+Z)*(X-Z)
+    sike_fp2add(P->X, P->Z, t1);                         // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1]
+    sike_fp2sub(P->X, P->Z, P->Z);                       // Z = (X+Z)*coeff[1] - (X-Z)*coeff[2] (sign is irrelevant: squared below)
+    sike_fp2sqr_mont(t1, t1);                            // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2
+    sike_fp2sqr_mont(P->Z, P->Z);                        // Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2
+    sike_fp2add(t1, t0, P->X);                           // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2
+    sike_fp2sub(P->Z, t0, t0);                           // t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z)
+    sike_fp2mul_mont(P->X, t1, P->X);                    // Xfinal
+    sike_fp2mul_mont(P->Z, t0, P->Z);                    // Zfinal
+}
+
+
+void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus)
+{ // Tripling of a Montgomery point in projective coordinates (X:Z).
+  // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
+  // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3). xTPLe() calls this with Q aliasing P.
+    f2elm_t t0, t1, t2, t3, t4, t5, t6;
+
+    sike_fp2sub(P->X, P->Z, t0);                         // t0 = X-Z
+    sike_fp2sqr_mont(t0, t2);                            // t2 = (X-Z)^2
+    sike_fp2add(P->X, P->Z, t1);                         // t1 = X+Z
+    sike_fp2sqr_mont(t1, t3);                            // t3 = (X+Z)^2
+    sike_fp2add(t0, t1, t4);                             // t4 = 2*X
+    sike_fp2sub(t1, t0, t0);                             // t0 = 2*Z
+    sike_fp2sqr_mont(t4, t1);                            // t1 = 4*X^2
+    sike_fp2sub(t1, t3, t1);                             // t1 = 4*X^2 - (X+Z)^2
+    sike_fp2sub(t1, t2, t1);                             // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2
+    sike_fp2mul_mont(t3, A24plus, t5);                   // t5 = A24plus*(X+Z)^2
+    sike_fp2mul_mont(t3, t5, t3);                        // t3 = A24plus*(X+Z)^4
+    sike_fp2mul_mont(A24minus, t2, t6);                  // t6 = A24minus*(X-Z)^2
+    sike_fp2mul_mont(t2, t6, t2);                        // t2 = A24minus*(X-Z)^4
+    sike_fp2sub(t2, t3, t3);                             // t3 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4
+    sike_fp2sub(t5, t6, t2);                             // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2
+    sike_fp2mul_mont(t1, t2, t1);                        // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2]
+    sike_fp2add(t3, t1, t2);                             // t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^4 - A24plus*(X+Z)^4
+    sike_fp2sqr_mont(t2, t2);                            // t2 = t2^2
+    sike_fp2mul_mont(t4, t2, Q->X);                      // X3 = 2*X*t2
+    sike_fp2sub(t3, t1, t1);                             // t1 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2]
+    sike_fp2sqr_mont(t1, t1);                            // t1 = t1^2
+    sike_fp2mul_mont(t0, t1, Q->Z);                      // Z3 = 2*Z*t1
+}
+
+void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, size_t e)
+{ // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings.
+  // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
+  // Output: projective Montgomery x-coordinates Q <- (3^e)*P. Q may alias P (memmove is used for the copy).
+    memmove(Q, P, sizeof(*P));
+    for (size_t i = 0; i < e; i++) {
+        xTPL(Q, Q, A24minus, A24plus);
+    }
+}
+
+void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t* coeff)
+{ // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3.
+  // Input: projective point of order three P = (X3:Z3).
+  // Output: A24minus and A24plus for the 3-isogenous Montgomery curve, and coeff[0..1] for use in eval_3_isog().
+    f2elm_t t0, t1, t2, t3, t4;
+
+    sike_fp2sub(P->X, P->Z, coeff[0]);                   // coeff0 = X-Z
+    sike_fp2sqr_mont(coeff[0], t0);                      // t0 = (X-Z)^2
+    sike_fp2add(P->X, P->Z, coeff[1]);                   // coeff1 = X+Z
+    sike_fp2sqr_mont(coeff[1], t1);                      // t1 = (X+Z)^2
+    sike_fp2add(t0, t1, t2);                             // t2 = (X+Z)^2 + (X-Z)^2
+    sike_fp2add(coeff[0], coeff[1], t3);                 // t3 = 2*X
+    sike_fp2sqr_mont(t3, t3);                            // t3 = 4*X^2
+    sike_fp2sub(t3, t2, t3);                             // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2
+    sike_fp2add(t1, t3, t2);                             // t2 = 4*X^2 - (X-Z)^2
+    sike_fp2add(t3, t0, t3);                             // t3 = 4*X^2 - (X+Z)^2
+    sike_fp2add(t0, t3, t4);                             // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2
+    sike_fp2add(t4, t4, t4);                             // t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2)
+    sike_fp2add(t1, t4, t4);                             // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2
+    sike_fp2mul_mont(t2, t4, A24minus);                  // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2]
+    sike_fp2add(t1, t2, t4);                             // t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2
+    sike_fp2add(t4, t4, t4);                             // t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2)
+    sike_fp2add(t0, t4, t4);                             // t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2
+    sike_fp2mul_mont(t3, t4, t4);                        // t4 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2]
+    sike_fp2sub(t4, A24minus, t0);                       // t0 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] - [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2]
+    sike_fp2add(A24minus, t0, A24plus);                  // A24plus = A24minus + t0 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2]
+}
+
+
+void eval_3_isog(point_proj_t Q, f2elm_t* coeff)
+{ // Evaluates a 3-isogeny phi at the projective point Q = (X:Z), where phi is described by the
+  // 2 coefficients in coeff (computed in the function get_3_isog() from a point (X3:Z3) of order 3).
+  // Inputs: the coefficients coeff[0..1] and the projective point Q = (X:Z).
+  // Output: the projective point Q <- phi(Q) (Q is overwritten).
+    f2elm_t t0, t1, t2;
+
+    sike_fp2add(Q->X, Q->Z, t0);                       // t0 = X+Z
+    sike_fp2sub(Q->X, Q->Z, t1);                       // t1 = X-Z
+    sike_fp2mul_mont(t0, coeff[0], t0);                // t0 = coeff0*(X+Z)
+    sike_fp2mul_mont(t1, coeff[1], t1);                // t1 = coeff1*(X-Z)
+    sike_fp2add(t0, t1, t2);                           // t2 = coeff0*(X+Z) + coeff1*(X-Z)
+    sike_fp2sub(t1, t0, t0);                           // t0 = coeff1*(X-Z) - coeff0*(X+Z)
+    sike_fp2sqr_mont(t2, t2);                          // t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2
+    sike_fp2sqr_mont(t0, t0);                          // t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2
+    sike_fp2mul_mont(Q->X, t2, Q->X);                  // X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2
+    sike_fp2mul_mont(Q->Z, t0, Q->Z);                  // Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2
+}
+
+
+void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3)
+{ // 3-way simultaneous inversion using a single call to sike_fp2inv_mont().
+  // Input: z1,z2,z3
+  // Output: 1/z1,1/z2,1/z3 (override inputs).
+    f2elm_t t0, t1, t2, t3;
+
+    sike_fp2mul_mont(z1, z2, t0);                      // t0 = z1*z2
+    sike_fp2mul_mont(z3, t0, t1);                      // t1 = z1*z2*z3
+    sike_fp2inv_mont(t1);                              // t1 = 1/(z1*z2*z3)
+    sike_fp2mul_mont(z3, t1, t2);                      // t2 = 1/(z1*z2)
+    sike_fp2mul_mont(t2, z2, t3);                      // t3 = 1/z1
+    sike_fp2mul_mont(t2, z1, z2);                      // z2 = 1/z2
+    sike_fp2mul_mont(t0, t1, z3);                      // z3 = 1/z3
+    sike_fp2copy(t3, z1);                              // z1 = 1/z1
+}
+
+
+void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A)
+{ // Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A.
+  // Input: the x-coordinates xP, xQ, and xR of the points P, Q and R.
+  // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x.
+    f2elm_t t0, t1, one = F2ELM_INIT;
+
+    extern const struct params_t params;
+    sike_fpcopy(params.mont_one, one->c0);             // one = 1 in Montgomery form (only c0 is set here)
+    sike_fp2add(xP, xQ, t1);                           // t1 = xP+xQ
+    sike_fp2mul_mont(xP, xQ, t0);                      // t0 = xP*xQ
+    sike_fp2mul_mont(xR, t1, A);                       // A = xR*t1
+    sike_fp2add(t0, A, A);                             // A = A+t0
+    sike_fp2mul_mont(t0, xR, t0);                      // t0 = t0*xR
+    sike_fp2sub(A, one, A);                            // A = A-1
+    sike_fp2add(t0, t0, t0);                           // t0 = t0+t0
+    sike_fp2add(t1, xR, t1);                           // t1 = t1+xR
+    sike_fp2add(t0, t0, t0);                           // t0 = t0+t0
+    sike_fp2sqr_mont(A, A);                            // A = A^2
+    sike_fp2inv_mont(t0);                              // t0 = 1/t0
+    sike_fp2mul_mont(A, t0, A);                        // A = A*t0
+    sike_fp2sub(A, t1, A);                             // Afinal = A-t1
+}
+
+
+void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv)
+{ // Computes the j-invariant of a Montgomery curve with projective constant.
+  // Input: A,C in GF(p^2).
+  // Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x.
+    f2elm_t t0, t1;
+
+    sike_fp2sqr_mont(A, jinv);                         // jinv = A^2
+    sike_fp2sqr_mont(C, t1);                           // t1 = C^2
+    sike_fp2add(t1, t1, t0);                           // t0 = t1+t1
+    sike_fp2sub(jinv, t0, t0);                         // t0 = jinv-t0
+    sike_fp2sub(t0, t1, t0);                           // t0 = t0-t1
+    sike_fp2sub(t0, t1, jinv);                         // jinv = t0-t1
+    sike_fp2sqr_mont(t1, t1);                          // t1 = t1^2
+    sike_fp2mul_mont(jinv, t1, jinv);                  // jinv = jinv*t1
+    sike_fp2add(t0, t0, t0);                           // t0 = t0+t0
+    sike_fp2add(t0, t0, t0);                           // t0 = t0+t0
+    sike_fp2sqr_mont(t0, t1);                          // t1 = t0^2
+    sike_fp2mul_mont(t0, t1, t0);                      // t0 = t0*t1
+    sike_fp2add(t0, t0, t0);                           // t0 = t0+t0
+    sike_fp2add(t0, t0, t0);                           // t0 = t0+t0
+    sike_fp2inv_mont(jinv);                            // jinv = 1/jinv
+    sike_fp2mul_mont(jinv, t0, jinv);                  // jinv = t0*jinv
+}
+
+
+void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24)
+{ // Simultaneous doubling and differential addition.
+  // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4.
+  // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP.
+    f2elm_t t0, t1, t2;
+
+    sike_fp2add(P->X, P->Z, t0);                         // t0 = XP+ZP
+    sike_fp2sub(P->X, P->Z, t1);                         // t1 = XP-ZP
+    sike_fp2sqr_mont(t0, P->X);                          // XP = (XP+ZP)^2
+    sike_fp2sub(Q->X, Q->Z, t2);                         // t2 = XQ-ZQ
+    sike_fp2correction(t2);                              // reduce both coordinates of t2 to [0, p-1]
+    sike_fp2add(Q->X, Q->Z, Q->X);                       // XQ = XQ+ZQ
+    sike_fp2mul_mont(t0, t2, t0);                        // t0 = (XP+ZP)*(XQ-ZQ)
+    sike_fp2sqr_mont(t1, P->Z);                          // ZP = (XP-ZP)^2
+    sike_fp2mul_mont(t1, Q->X, t1);                      // t1 = (XP-ZP)*(XQ+ZQ)
+    sike_fp2sub(P->X, P->Z, t2);                         // t2 = (XP+ZP)^2-(XP-ZP)^2
+    sike_fp2mul_mont(P->X, P->Z, P->X);                  // XP = (XP+ZP)^2*(XP-ZP)^2
+    sike_fp2mul_mont(t2, A24, Q->X);                     // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2]
+    sike_fp2sub(t0, t1, Q->Z);                           // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)
+    sike_fp2add(Q->X, P->Z, P->Z);                       // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2
+    sike_fp2add(t0, t1, Q->X);                           // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)
+    sike_fp2mul_mont(P->Z, t2, P->Z);                    // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2]
+    sike_fp2sqr_mont(Q->Z, Q->Z);                        // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
+    sike_fp2sqr_mont(Q->X, Q->X);                        // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2
+    sike_fp2mul_mont(Q->Z, xPQ, Q->Z);                   // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
+}
diff --git a/src/kem/sike/p434/isogeny.h b/src/kem/sike/p434/isogeny.h
new file mode 100644
index 00000000..460c8c66
--- /dev/null
+++ b/src/kem/sike/p434/isogeny.h
@@ -0,0 +1,49 @@
+#ifndef ISOGENY_H_
+#define ISOGENY_H_
+
+// Computes [2^e](X:Z) on Montgomery curve with projective
+// constant via e repeated doublings.
+void xDBLe(
+    const point_proj_t P, point_proj_t Q, const f2elm_t A24plus,
+    const f2elm_t C24, size_t e);
+// Simultaneous doubling and differential addition.
+void xDBLADD(
+    point_proj_t P, point_proj_t Q, const f2elm_t xPQ,
+    const f2elm_t A24);  // in place: P <- 2*P and Q <- P+Q; xPQ is the affine x(P-Q), A24 = (A+2)/4
+// Tripling of a Montgomery point in projective coordinates (X:Z), Q = 3*P.
+void xTPL(
+    const point_proj_t P, point_proj_t Q, const f2elm_t A24minus,
+    const f2elm_t A24plus);
+// Computes Q <- [3^e]P on Montgomery curve with projective constant
+// via e repeated triplings.
+void xTPLe(
+    const point_proj_t P, point_proj_t Q, const f2elm_t A24minus,
+    const f2elm_t A24plus, size_t e);
+// Given the x-coordinates of P, Q, and R, returns the value A
+// corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A.
+void get_A(
+    const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A);
+// Computes the j-invariant of a Montgomery curve with projective constant (A:C); the result is written to jinv.
+void j_inv(
+    const f2elm_t A, const f2elm_t C, f2elm_t jinv);
+// Computes the corresponding 4-isogeny of a projective Montgomery
+// point (X4:Z4) of order 4. Writes A24plus, C24 and coeff[0..2].
+void get_4_isog(
+    const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff);
+// Computes the corresponding 3-isogeny of a projective Montgomery
+// point (X3:Z3) of order 3. Writes A24minus, A24plus and coeff[0..1].
+void get_3_isog(
+    const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus,
+    f2elm_t* coeff);
+// Evaluates a 3-isogeny phi at the projective point Q = (X:Z), in place (Q <- phi(Q)),
+// using the 2 coefficients in coeff produced by get_3_isog().
+void eval_3_isog(
+    point_proj_t Q, f2elm_t* coeff);
+// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny.
+void eval_4_isog( + point_proj_t P, f2elm_t* coeff); +// 3-way simultaneous inversion +void inv_3_way( + f2elm_t z1, f2elm_t z2, f2elm_t z3); + +#endif // ISOGENY_H_ diff --git a/src/kem/sike/p434/params.c b/src/kem/sike/p434/params.c new file mode 100644 index 00000000..b13f4c87 --- /dev/null +++ b/src/kem/sike/p434/params.c @@ -0,0 +1,128 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: supersingular isogeny parameters and generation of functions for P434 +*********************************************************************************************/ + +#include "utils.h" + +// Parameters for isogeny system "SIKE" +const struct params_t params = { + .prime = { + U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), + U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFDC1767AE2FFFFFF), + U64_TO_WORDS(0x7BC65C783158AEA3), U64_TO_WORDS(0x6CFC5FD681C52056), + U64_TO_WORDS(0x0002341F27177344) + }, + .prime_p1 = { + U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), + U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xFDC1767AE3000000), + U64_TO_WORDS(0x7BC65C783158AEA3), U64_TO_WORDS(0x6CFC5FD681C52056), + U64_TO_WORDS(0x0002341F27177344) + }, + .prime_x2 = { + U64_TO_WORDS(0xFFFFFFFFFFFFFFFE), U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), + U64_TO_WORDS(0xFFFFFFFFFFFFFFFF), U64_TO_WORDS(0xFB82ECF5C5FFFFFF), + U64_TO_WORDS(0xF78CB8F062B15D47), U64_TO_WORDS(0xD9F8BFAD038A40AC), + U64_TO_WORDS(0x0004683E4E2EE688) + }, + .A_gen = { + U64_TO_WORDS(0x05ADF455C5C345BF), U64_TO_WORDS(0x91935C5CC767AC2B), + U64_TO_WORDS(0xAFE4E879951F0257), U64_TO_WORDS(0x70E792DC89FA27B1), + U64_TO_WORDS(0xF797F526BB48C8CD), U64_TO_WORDS(0x2181DB6131AF621F), + U64_TO_WORDS(0x00000A1C08B1ECC4), // XPA0 + U64_TO_WORDS(0x74840EB87CDA7788), U64_TO_WORDS(0x2971AA0ECF9F9D0B), + U64_TO_WORDS(0xCB5732BDF41715D5), U64_TO_WORDS(0x8CD8E51F7AACFFAA), + 
U64_TO_WORDS(0xA7F424730D7E419F), U64_TO_WORDS(0xD671EB919A179E8C), + U64_TO_WORDS(0x0000FFA26C5A924A), // XPA1 + U64_TO_WORDS(0xFEC6E64588B7273B), U64_TO_WORDS(0xD2A626D74CBBF1C6), + U64_TO_WORDS(0xF8F58F07A78098C7), U64_TO_WORDS(0xE23941F470841B03), + U64_TO_WORDS(0x1B63EDA2045538DD), U64_TO_WORDS(0x735CFEB0FFD49215), + U64_TO_WORDS(0x0001C4CB77542876), // XQA0 + U64_TO_WORDS(0xADB0F733C17FFDD6), U64_TO_WORDS(0x6AFFBD037DA0A050), + U64_TO_WORDS(0x680EC43DB144E02F), U64_TO_WORDS(0x1E2E5D5FF524E374), + U64_TO_WORDS(0xE2DDA115260E2995), U64_TO_WORDS(0xA6E4B552E2EDE508), + U64_TO_WORDS(0x00018ECCDDF4B53E), // XQA1 + U64_TO_WORDS(0x01BA4DB518CD6C7D), U64_TO_WORDS(0x2CB0251FE3CC0611), + U64_TO_WORDS(0x259B0C6949A9121B), U64_TO_WORDS(0x60E17AC16D2F82AD), + U64_TO_WORDS(0x3AA41F1CE175D92D), U64_TO_WORDS(0x413FBE6A9B9BC4F3), + U64_TO_WORDS(0x00022A81D8D55643), // XRA0 + U64_TO_WORDS(0xB8ADBC70FC82E54A), U64_TO_WORDS(0xEF9CDDB0D5FADDED), + U64_TO_WORDS(0x5820C734C80096A0), U64_TO_WORDS(0x7799994BAA96E0E4), + U64_TO_WORDS(0x044961599E379AF8), U64_TO_WORDS(0xDB2B94FBF09F27E2), + U64_TO_WORDS(0x0000B87FC716C0C6) // XRA1 + }, + .B_gen = { + U64_TO_WORDS(0x6E5497556EDD48A3), U64_TO_WORDS(0x2A61B501546F1C05), + U64_TO_WORDS(0xEB919446D049887D), U64_TO_WORDS(0x5864A4A69D450C4F), + U64_TO_WORDS(0xB883F276A6490D2B), U64_TO_WORDS(0x22CC287022D5F5B9), + U64_TO_WORDS(0x0001BED4772E551F), // XPB0 + U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), + U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), + U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), + U64_TO_WORDS(0x0000000000000000), // XPB1 + U64_TO_WORDS(0xFAE2A3F93D8B6B8E), U64_TO_WORDS(0x494871F51700FE1C), + U64_TO_WORDS(0xEF1A94228413C27C), U64_TO_WORDS(0x498FF4A4AF60BD62), + U64_TO_WORDS(0xB00AD2A708267E8A), U64_TO_WORDS(0xF4328294E017837F), + U64_TO_WORDS(0x000034080181D8AE), // XQB0 + U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), + 
U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), + U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x0000000000000000), + U64_TO_WORDS(0x0000000000000000), // XQB1 + U64_TO_WORDS(0x283B34FAFEFDC8E4), U64_TO_WORDS(0x9208F44977C3E647), + U64_TO_WORDS(0x7DEAE962816F4E9A), U64_TO_WORDS(0x68A2BA8AA262EC9D), + U64_TO_WORDS(0x8176F112EA43F45B), U64_TO_WORDS(0x02106D022634F504), + U64_TO_WORDS(0x00007E8A50F02E37), // XRB0 + U64_TO_WORDS(0xB378B7C1DA22CCB1), U64_TO_WORDS(0x6D089C99AD1D9230), + U64_TO_WORDS(0xEBE15711813E2369), U64_TO_WORDS(0x2B35A68239D48A53), + U64_TO_WORDS(0x445F6FD138407C93), U64_TO_WORDS(0xBEF93B29A3F6B54B), + U64_TO_WORDS(0x000173FA910377D3) // XRB1 + }, + .mont_R2 = { + U64_TO_WORDS(0x28E55B65DCD69B30), U64_TO_WORDS(0xACEC7367768798C2), + U64_TO_WORDS(0xAB27973F8311688D), U64_TO_WORDS(0x175CC6AF8D6C7C0B), + U64_TO_WORDS(0xABCD92BF2DDE347E), U64_TO_WORDS(0x69E16A61C7686D9A), + U64_TO_WORDS(0x000025A89BCDD12A) + }, + .mont_one = { + U64_TO_WORDS(0x000000000000742C), U64_TO_WORDS(0x0000000000000000), + U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0xB90FF404FC000000), + U64_TO_WORDS(0xD801A4FB559FACD4), U64_TO_WORDS(0xE93254545F77410C), + U64_TO_WORDS(0x0000ECEEA7BD2EDA) + }, + .mont_six = { + U64_TO_WORDS(0x000000000002B90A), U64_TO_WORDS(0x0000000000000000), + U64_TO_WORDS(0x0000000000000000), U64_TO_WORDS(0x5ADCCB2822000000), + U64_TO_WORDS(0x187D24F39F0CAFB4), U64_TO_WORDS(0x9D353A4D394145A0), + U64_TO_WORDS(0x00012559A0403298) + }, + .A_strat = { + 0x30, 0x1C, 0x10, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, + 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, + 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, + 0x02, 0x01, 0x01, 0x0D, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02, + 0x01, 0x01, 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x04, + 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, + 0x15, 0x0C, 0x07, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, + 0x03, 0x02, 0x01, 0x01, 0x01, 0x01, 0x05, 0x03, 0x02, 
0x01, + 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x09, 0x05, 0x03, + 0x02, 0x01, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x01, 0x04, + 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01 + }, + .B_strat = { + 0x42, 0x21, 0x11, 0x09, 0x05, 0x03, 0x02, 0x01, 0x01, 0x01, + 0x01, 0x02, 0x01, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x01, + 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, + 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x10, + 0x08, 0x04, 0x02, 0x01, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, + 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x08, 0x04, 0x02, 0x01, + 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, + 0x01, 0x20, 0x10, 0x08, 0x04, 0x03, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, + 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, + 0x01, 0x01, 0x02, 0x01, 0x01, 0x10, 0x08, 0x04, 0x02, 0x01, + 0x01, 0x02, 0x01, 0x01, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, + 0x01, 0x08, 0x04, 0x02, 0x01, 0x01, 0x02, 0x01, 0x01, 0x04, + 0x02, 0x01, 0x01, 0x02, 0x01, 0x01 + } +}; diff --git a/src/kem/sike/p434/sike.c b/src/kem/sike/p434/sike.c new file mode 100644 index 00000000..f52fe5c4 --- /dev/null +++ b/src/kem/sike/p434/sike.c @@ -0,0 +1,522 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: supersingular isogeny key encapsulation (SIKE) protocol +*********************************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "isogeny.h" +#include "fpx.h" + +extern const struct params_t params; + +// SIDH_JINV_BYTESZ is a number of bytes used for encoding j-invariant. 
+#define SIDH_JINV_BYTESZ    110U
+// SIDH_PRV_A_BITSZ is a number of bits of SIDH private key (2-isogeny)
+#define SIDH_PRV_A_BITSZ    216U
+// SIDH_PRV_B_BITSZ is a number of bits of SIDH private key (3-isogeny)
+#define SIDH_PRV_B_BITSZ    217U
+// MAX_INT_POINTS_ALICE is a number of points used in 2-isogeny tree computation
+#define MAX_INT_POINTS_ALICE    7U
+// MAX_INT_POINTS_BOB is a number of points used in 3-isogeny tree computation
+#define MAX_INT_POINTS_BOB      8U
+
+// Swap points (portable C version, compiled when ARCH_X86_64 is not defined or ARCH_GENERIC is defined).
+// If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P
+#if !defined(ARCH_X86_64) || defined(ARCH_GENERIC)
+static void sike_cswap(point_proj_t P, point_proj_t Q, const crypto_word_t option)
+{
+    crypto_word_t temp;  // masked XOR difference: 0 when option = 0, P^Q when option is all ones
+    for (size_t i = 0; i < NWORDS_FIELD; i++) {
+        temp = option & (P->X->c0[i] ^ Q->X->c0[i]);  // X, real part
+        P->X->c0[i] = temp ^ P->X->c0[i];
+        Q->X->c0[i] = temp ^ Q->X->c0[i];
+        temp = option & (P->Z->c0[i] ^ Q->Z->c0[i]);  // Z, real part
+        P->Z->c0[i] = temp ^ P->Z->c0[i];
+        Q->Z->c0[i] = temp ^ Q->Z->c0[i];
+        temp = option & (P->X->c1[i] ^ Q->X->c1[i]);  // X, imaginary part
+        P->X->c1[i] = temp ^ P->X->c1[i];
+        Q->X->c1[i] = temp ^ Q->X->c1[i];
+        temp = option & (P->Z->c1[i] ^ Q->Z->c1[i]);  // Z, imaginary part
+        P->Z->c1[i] = temp ^ P->Z->c1[i];
+        Q->Z->c1[i] = temp ^ Q->Z->c1[i];
+    }
+}
+#endif
+
+// Swap points.
+// If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P
+static inline void sike_fp2cswap(point_proj_t P, point_proj_t Q, const crypto_word_t option)
+{
+#if defined(ARCH_X86_64) && !defined(ARCH_GENERIC)
+    sike_cswap_asm(P, Q, option);  // x86-64 assembly implementation
+#else
+    sike_cswap(P, Q, option);      // portable C implementation
+#endif
+}
+
+static void ladder3Pt(  // Three-point ladder over the low nbits bits of m; the result is left in R (presumably x(P + [m]Q) - confirm)
+    const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const uint8_t* m,  // m: scalar bytes, bit i is bit (i & 7) of m[i >> 3]
+    int is_A, point_proj_t R, const f2elm_t A) {  // is_A != 0 selects SIDH_PRV_A_BITSZ bits, otherwise SIDH_PRV_B_BITSZ
+    point_proj_t R0 = POINT_PROJ_INIT, R2 = POINT_PROJ_INIT;
+    f2elm_t A24 = F2ELM_INIT;
+    crypto_word_t mask;
+    int bit, swap, prevbit = 0;
+
+    const size_t nbits = is_A?SIDH_PRV_A_BITSZ:SIDH_PRV_B_BITSZ;
+
+    // Initializing constant
+    sike_fpcopy(params.mont_one, A24[0].c0);  // A24 = 1
+    sike_fp2add(A24, A24, A24);               // A24 = 2
+    sike_fp2add(A, A24, A24);                 // A24 = A+2
+    sike_fp2div2(A24, A24);                   // A24 = (A+2)/2
+    sike_fp2div2(A24, A24);  // A24 = (A+2)/4
+
+    // Initializing points: R0 = (xQ:1), R2 = (xPQ:1), R = (xP:1)
+    sike_fp2copy(xQ, R0->X);
+    sike_fpcopy(params.mont_one, R0->Z[0].c0);
+    sike_fp2copy(xPQ, R2->X);
+    sike_fpcopy(params.mont_one, R2->Z[0].c0);
+    sike_fp2copy(xP, R->X);
+    sike_fpcopy(params.mont_one, R->Z[0].c0);
+    memset(R->Z->c1, 0, sizeof(R->Z->c1));    // R comes from the caller, so the imaginary part of Z is cleared explicitly
+
+    // Main loop: one xDBLADD per scalar bit, least significant bit first
+    for (size_t i = 0; i < nbits; i++) {
+        bit = (m[i >> 3] >> (i & 7)) & 1;
+        swap = bit ^ prevbit;              // swap only when the bit differs from the previous one
+        prevbit = bit;
+        mask = 0 - (crypto_word_t)swap;    // 0 or all ones, consumed by the branch-free swap
+
+        sike_fp2cswap(R, R2, mask);
+        xDBLADD(R0, R2, R->X, A24);               // R0 <- 2*R0, R2 <- R0+R2, with R->X passed as the difference x-coordinate
+        sike_fp2mul_mont(R2->X, R->Z, R2->X);     // scale R2->X by R->Z (R is projective while xDBLADD takes an affine difference)
+    }
+    swap = 0 ^ prevbit;                    // undo the swap left pending by the last bit
+    mask = 0 - (crypto_word_t)swap;
+    sike_fp2cswap(R, R2, mask);
+}
+
+// Initialization of basis points: gen holds 6 consecutive field elements XP.c0, XP.c1, XQ.c0, XQ.c1, XR.c0, XR.c1
+static inline void sike_init_basis(const crypto_word_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR) {
+    sike_fpcopy(gen, XP->c0);
+    sike_fpcopy(gen + NWORDS_FIELD, XP->c1);
+    sike_fpcopy(gen + 2*NWORDS_FIELD, XQ->c0);
+    sike_fpcopy(gen + 3*NWORDS_FIELD, XQ->c1);
+    sike_fpcopy(gen + 4*NWORDS_FIELD, XR->c0);
+    sike_fpcopy(gen + 5*NWORDS_FIELD, XR->c1);
+}
+
+// Conversion of GF(p^2) element from Montgomery to standard representation.
+static inline void sike_fp2_encode(const f2elm_t x, uint8_t *enc) { + f2elm_t t; + sike_from_fp2mont(x, t); + + // convert to bytes in little endian form + for (size_t i=0; i < FIELD_BYTESZ; i++) { + enc[i+ 0] = (t[0].c0[i/LSZ] >> (8*(i%LSZ))) & 0xFF; + enc[i+FIELD_BYTESZ] = (t[0].c1[i/LSZ] >> (8*(i%LSZ))) & 0xFF; + } +} + +// Parse byte sequence back into GF(p^2) element, and conversion to Montgomery representation. +// Elements over GF(p434) are encoded in FIELD_BYTESZ (55) octets in little endian format +// (i.e., the least significant octet is located in the lowest memory address). +static inline void fp2_decode(const uint8_t *enc, f2elm_t t) { + memset(t[0].c0, 0, sizeof(t[0].c0)); + memset(t[0].c1, 0, sizeof(t[0].c1)); + // convert bytes in little endian form to f2elm_t + for (size_t i = 0; i < FIELD_BYTESZ; i++) { + t[0].c0[i/LSZ] |= ((crypto_word_t)enc[i+ 0]) << (8*(i%LSZ)); + t[0].c1[i/LSZ] |= ((crypto_word_t)enc[i+FIELD_BYTESZ]) << (8*(i%LSZ)); + } + sike_to_fp2mont(t, t); +} + +// Alice's ephemeral public key generation +// Input: a private key skA; its low SIDH_PRV_A_BITSZ (216) bits are used. +// Output: the public key pkA consisting of 3 GF(p434^2) elements encoded in 3*SIDH_JINV_BYTESZ (330) bytes. 
+static void gen_iso_A(const uint8_t* skA, uint8_t* pkA) +{ + point_proj_t R, pts[MAX_INT_POINTS_ALICE]; + point_proj_t phiP = POINT_PROJ_INIT; + point_proj_t phiQ = POINT_PROJ_INIT; + point_proj_t phiR = POINT_PROJ_INIT; + f2elm_t XPA, XQA, XRA, coeff[3]; + f2elm_t A24plus = F2ELM_INIT; + f2elm_t C24 = F2ELM_INIT; + f2elm_t A = F2ELM_INIT; + unsigned int m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; + + // Initialize basis points + sike_init_basis(params.A_gen, XPA, XQA, XRA); + sike_init_basis(params.B_gen, phiP->X, phiQ->X, phiR->X); + sike_fpcopy(params.mont_one, (phiP->Z)->c0); + sike_fpcopy(params.mont_one, (phiQ->Z)->c0); + sike_fpcopy(params.mont_one, (phiR->Z)->c0); + + // Initialize constants: A24plus = A+2C, C24 = 4C, where A=6, C=1 + sike_fpcopy(params.mont_one, A24plus->c0); + sike_fp2add(A24plus, A24plus, A24plus); + sike_fp2add(A24plus, A24plus, C24); + sike_fp2add(A24plus, C24, A); + sike_fp2add(C24, C24, A24plus); + + // Retrieve kernel point + ladder3Pt(XPA, XQA, XRA, skA, 1, R, A); + + // Traverse tree + index = 0; + for (size_t row = 1; row < A_max; row++) { + while (index < A_max-row) { + sike_fp2copy(R->X, pts[npts]->X); + sike_fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = params.A_strat[ii++]; + xDBLe(R, R, A24plus, C24, (2*m)); + index += m; + } + get_4_isog(R, A24plus, C24, coeff); + + for (size_t i = 0; i < npts; i++) { + eval_4_isog(pts[i], coeff); + } + eval_4_isog(phiP, coeff); + eval_4_isog(phiQ, coeff); + eval_4_isog(phiR, coeff); + + sike_fp2copy(pts[npts-1]->X, R->X); + sike_fp2copy(pts[npts-1]->Z, R->Z); + index = pts_index[npts-1]; + npts -= 1; + } + + get_4_isog(R, A24plus, C24, coeff); + eval_4_isog(phiP, coeff); + eval_4_isog(phiQ, coeff); + eval_4_isog(phiR, coeff); + + inv_3_way(phiP->Z, phiQ->Z, phiR->Z); + sike_fp2mul_mont(phiP->X, phiP->Z, phiP->X); + sike_fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); + sike_fp2mul_mont(phiR->X, phiR->Z, phiR->X); + + // Format public key + 
sike_fp2_encode(phiP->X, pkA); + sike_fp2_encode(phiQ->X, pkA + SIDH_JINV_BYTESZ); + sike_fp2_encode(phiR->X, pkA + 2*SIDH_JINV_BYTESZ); +} + +// Bob's ephemeral key-pair generation +// It produces a private key skB and computes the public key pkB. +// The private key is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. +// The public key consists of 3 GF(p503^2) elements encoded in 378 bytes. +static void gen_iso_B(const uint8_t* skB, uint8_t* pkB) +{ + point_proj_t R, pts[MAX_INT_POINTS_BOB]; + point_proj_t phiP = POINT_PROJ_INIT; + point_proj_t phiQ = POINT_PROJ_INIT; + point_proj_t phiR = POINT_PROJ_INIT; + f2elm_t XPB, XQB, XRB, coeff[3]; + f2elm_t A24plus = F2ELM_INIT; + f2elm_t A24minus = F2ELM_INIT; + f2elm_t A = F2ELM_INIT; + unsigned int m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; + + // Initialize basis points + sike_init_basis(params.B_gen, XPB, XQB, XRB); + sike_init_basis(params.A_gen, phiP->X, phiQ->X, phiR->X); + sike_fpcopy(params.mont_one, (phiP->Z)->c0); + sike_fpcopy(params.mont_one, (phiQ->Z)->c0); + sike_fpcopy(params.mont_one, (phiR->Z)->c0); + + // Initialize constants: A24minus = A-2C, A24plus = A+2C, where A=6, C=1 + sike_fpcopy(params.mont_one, A24plus->c0); + sike_fp2add(A24plus, A24plus, A24plus); + sike_fp2add(A24plus, A24plus, A24minus); + sike_fp2add(A24plus, A24minus, A); + sike_fp2add(A24minus, A24minus, A24plus); + + // Retrieve kernel point + ladder3Pt(XPB, XQB, XRB, skB, 0, R, A); + + // Traverse tree + index = 0; + for (size_t row = 1; row < B_max; row++) { + while (index < B_max-row) { + sike_fp2copy(R->X, pts[npts]->X); + sike_fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = params.B_strat[ii++]; + xTPLe(R, R, A24minus, A24plus, m); + index += m; + } + get_3_isog(R, A24minus, A24plus, coeff); + + for (size_t i = 0; i < npts; i++) { + eval_3_isog(pts[i], coeff); + } + eval_3_isog(phiP, coeff); + eval_3_isog(phiQ, coeff); + eval_3_isog(phiR, coeff); + + 
sike_fp2copy(pts[npts-1]->X, R->X); + sike_fp2copy(pts[npts-1]->Z, R->Z); + index = pts_index[npts-1]; + npts -= 1; + } + + get_3_isog(R, A24minus, A24plus, coeff); + eval_3_isog(phiP, coeff); + eval_3_isog(phiQ, coeff); + eval_3_isog(phiR, coeff); + + inv_3_way(phiP->Z, phiQ->Z, phiR->Z); + sike_fp2mul_mont(phiP->X, phiP->Z, phiP->X); + sike_fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); + sike_fp2mul_mont(phiR->X, phiR->Z, phiR->X); + + // Format public key + sike_fp2_encode(phiP->X, pkB); + sike_fp2_encode(phiQ->X, pkB + SIDH_JINV_BYTESZ); + sike_fp2_encode(phiR->X, pkB + 2*SIDH_JINV_BYTESZ); +} + +// Alice's ephemeral shared secret computation +// It produces a shared secret key ssA using her secret key skA and Bob's public key pkB +// Inputs: Alice's skA is an integer in the range [0, 2^250 - 1], stored in 32 bytes. +// Bob's pkB consists of 3 GF(p503^2) elements encoded in 378 bytes. +// Output: a shared secret ssA that consists of one element in GF(p503^2) encoded in 126 bytes. +static void ex_iso_A(const uint8_t* skA, const uint8_t* pkB, uint8_t* ssA) +{ + point_proj_t R, pts[MAX_INT_POINTS_ALICE]; + f2elm_t coeff[3], PKB[3], jinv; + f2elm_t A24plus = F2ELM_INIT; + f2elm_t C24 = F2ELM_INIT; + f2elm_t A = F2ELM_INIT; + unsigned int m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; + + // Initialize images of Bob's basis + fp2_decode(pkB, PKB[0]); + fp2_decode(pkB + SIDH_JINV_BYTESZ, PKB[1]); + fp2_decode(pkB + 2*SIDH_JINV_BYTESZ, PKB[2]); + + // Initialize constants + get_A(PKB[0], PKB[1], PKB[2], A); + sike_fpadd(params.mont_one, params.mont_one, C24->c0); + sike_fp2add(A, C24, A24plus); + sike_fpadd(C24->c0, C24->c0, C24->c0); + + // Retrieve kernel point + ladder3Pt(PKB[0], PKB[1], PKB[2], skA, 1, R, A); + + // Traverse tree + index = 0; + for (size_t row = 1; row < A_max; row++) { + while (index < A_max-row) { + sike_fp2copy(R->X, pts[npts]->X); + sike_fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = params.A_strat[ii++]; + 
xDBLe(R, R, A24plus, C24, (2*m)); + index += m; + } + get_4_isog(R, A24plus, C24, coeff); + + for (size_t i = 0; i < npts; i++) { + eval_4_isog(pts[i], coeff); + } + + sike_fp2copy(pts[npts-1]->X, R->X); + sike_fp2copy(pts[npts-1]->Z, R->Z); + index = pts_index[npts-1]; + npts -= 1; + } + + get_4_isog(R, A24plus, C24, coeff); + sike_fp2add(A24plus, A24plus, A24plus); + sike_fp2sub(A24plus, C24, A24plus); + sike_fp2add(A24plus, A24plus, A24plus); + j_inv(A24plus, C24, jinv); + sike_fp2_encode(jinv, ssA); +} + +// Bob's ephemeral shared secret computation +// It produces a shared secret key ssB using his secret key skB and Alice's public key pkA +// Inputs: Bob's skB is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. +// Alice's pkA consists of 3 GF(p503^2) elements encoded in 378 bytes. +// Output: a shared secret ssB that consists of one element in GF(p503^2) encoded in 126 bytes. +static void ex_iso_B(const uint8_t* skB, const uint8_t* pkA, uint8_t* ssB) +{ + point_proj_t R, pts[MAX_INT_POINTS_BOB]; + f2elm_t coeff[3], PKB[3], jinv; + f2elm_t A24plus = F2ELM_INIT; + f2elm_t A24minus = F2ELM_INIT; + f2elm_t A = F2ELM_INIT; + unsigned int m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; + + // Initialize images of Alice's basis + fp2_decode(pkA, PKB[0]); + fp2_decode(pkA + SIDH_JINV_BYTESZ, PKB[1]); + fp2_decode(pkA + 2*SIDH_JINV_BYTESZ, PKB[2]); + + // Initialize constants + get_A(PKB[0], PKB[1], PKB[2], A); + sike_fpadd(params.mont_one, params.mont_one, A24minus->c0); + sike_fp2add(A, A24minus, A24plus); + sike_fp2sub(A, A24minus, A24minus); + + // Retrieve kernel point + ladder3Pt(PKB[0], PKB[1], PKB[2], skB, 0, R, A); + + // Traverse tree + index = 0; + for (size_t row = 1; row < B_max; row++) { + while (index < B_max-row) { + sike_fp2copy(R->X, pts[npts]->X); + sike_fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = params.B_strat[ii++]; + xTPLe(R, R, A24minus, A24plus, m); + index += m; + } + get_3_isog(R, 
A24minus, A24plus, coeff); + + for (size_t i = 0; i < npts; i++) { + eval_3_isog(pts[i], coeff); + } + + sike_fp2copy(pts[npts-1]->X, R->X); + sike_fp2copy(pts[npts-1]->Z, R->Z); + index = pts_index[npts-1]; + npts -= 1; + } + + get_3_isog(R, A24minus, A24plus, coeff); + sike_fp2add(A24plus, A24minus, A); + sike_fp2add(A, A, A); + sike_fp2sub(A24plus, A24minus, A24plus); + j_inv(A, A24plus, jinv); + sike_fp2_encode(jinv, ssB); +} + +int SIKE_keypair(uint8_t out_priv[SIKE_PRV_BYTESZ], + uint8_t out_pub[SIKE_PUB_BYTESZ]) { + // Calculate private key for Alice. Needs to be in range [0, 2^0xFA - 1] and < + // 253 bits + randombytes(out_priv, SIKE_PRV_BYTESZ); + out_priv[31] = (out_priv[31] | 0x01) & 0x03; + + gen_iso_B(out_priv, out_pub); + return 1; +} + +void SIKE_encaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], + uint8_t out_ciphertext[SIKE_CT_BYTESZ], + const uint8_t pub_key[SIKE_PUB_BYTESZ]) { + // Secret buffer is reused by the function to store some ephemeral + // secret data. It's size must be maximum of 64, + // SIKE_MSG_BYTESZ and SIDH_PRV_A_BITSZ in bytes. + uint8_t secret[32]; // OZAPTF, why? 
+ uint8_t j[SIDH_JINV_BYTESZ]; + uint8_t temp[SIKE_MSG_BYTESZ + SIKE_CT_BYTESZ]; + shake256incctx ctx; + + // Generate secret key for A + // secret key A = SHA256({0,1}^n || pub_key)) mod SIDH_PRV_A_BITSZ + randombytes(temp, SIKE_MSG_BYTESZ); + + shake256_inc_init(&ctx); + shake256_inc_absorb(&ctx, temp, SIKE_MSG_BYTESZ); + shake256_inc_absorb(&ctx, pub_key, SIKE_PUB_BYTESZ); + shake256_inc_finalize(&ctx); + shake256_inc_squeeze(secret, 32, &ctx); + shake256_inc_ctx_release(&ctx); + + // Generate public key for A - first part of the ciphertext + gen_iso_A(secret, out_ciphertext); + + // Generate c1: + // h = SHA256(j-invariant) + // c1 = h ^ m + ex_iso_A(secret, pub_key, j); + shake256(secret, sizeof secret, j, sizeof j); + + // c1 = h ^ m + uint8_t *c1 = &out_ciphertext[SIKE_PUB_BYTESZ]; + for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) { + c1[i] = temp[i] ^ secret[i]; + } + + shake256_inc_init(&ctx); + shake256_inc_absorb(&ctx, temp, SIKE_MSG_BYTESZ); + shake256_inc_absorb(&ctx, out_ciphertext, SIKE_CT_BYTESZ); + shake256_inc_finalize(&ctx); + shake256_inc_squeeze(secret, 32, &ctx); + shake256_inc_ctx_release(&ctx); + // Generate shared secret out_shared_key = SHA256(m||out_ciphertext) + memcpy(out_shared_key, secret, SIKE_SS_BYTESZ); +} + +void SIKE_decaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], + const uint8_t ciphertext[SIKE_CT_BYTESZ], + const uint8_t pub_key[SIKE_PUB_BYTESZ], + const uint8_t priv_key[SIKE_PRV_BYTESZ]) { + // Secret buffer is reused by the function to store some ephemeral + // secret data. It's size must be maximum of 64, + // SIKE_MSG_BYTESZ and SIDH_PRV_A_BITSZ in bytes. 
+ uint8_t secret[32]; + uint8_t j[SIDH_JINV_BYTESZ]; + uint8_t c0[SIKE_PUB_BYTESZ]; + uint8_t temp[SIKE_MSG_BYTESZ]; + uint8_t shared_nok[SIKE_MSG_BYTESZ]; + shake256incctx ctx; + + // This is OK as we are only using ephemeral keys in BoringSSL + randombytes(shared_nok, SIKE_MSG_BYTESZ); + + // Recover m + // Let ciphertext = c0 || c1 - both have fixed sizes + // m = F(j-invariant(c0, priv_key)) ^ c1 + ex_iso_B(priv_key, ciphertext, j); + + shake256(secret, sizeof secret, j, sizeof j); + + + const uint8_t *c1 = &ciphertext[sizeof(c0)]; + for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) { + temp[i] = c1[i] ^ secret[i]; + } + + shake256_inc_init(&ctx); + shake256_inc_absorb(&ctx, temp, SIKE_MSG_BYTESZ); + shake256_inc_absorb(&ctx, pub_key, SIKE_PUB_BYTESZ); + shake256_inc_finalize(&ctx); + shake256_inc_squeeze(secret, 32, &ctx); + shake256_inc_ctx_release(&ctx); + + // Recover c0 = public key A + gen_iso_A(secret, c0); + crypto_word_t ok = ct_uint_eq( + ct_mem_eq(c0, ciphertext, SIKE_PUB_BYTESZ), 1); + for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) { + temp[i] = ct_select_8(ok, temp[i], shared_nok[i]); + } + + shake256_inc_init(&ctx); + shake256_inc_absorb(&ctx, temp, SIKE_MSG_BYTESZ); + shake256_inc_absorb(&ctx, ciphertext, SIKE_CT_BYTESZ); + shake256_inc_finalize(&ctx); + shake256_inc_squeeze(secret, 32, &ctx); + shake256_inc_ctx_release(&ctx); + + // Generate shared secret out_shared_key = SHA256(m||ciphertext) + memcpy(out_shared_key, secret, SIKE_SS_BYTESZ); +} diff --git a/src/kem/sike/p434/utils.h b/src/kem/sike/p434/utils.h new file mode 100644 index 00000000..e483d00f --- /dev/null +++ b/src/kem/sike/p434/utils.h @@ -0,0 +1,214 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: internal header file for P434 +*********************************************************************************************/ + +#ifndef UTILS_H_ +#define UTILS_H_ 
+ +#include +#include + +// Conversion macro from number of bits to number of bytes +#define BITS_TO_BYTES(nbits) (((nbits)+7)/8) + +// Bit size of the field +#define BITS_FIELD 434 +// Byte size of the field +#define FIELD_BYTESZ BITS_TO_BYTES(BITS_FIELD) +// Number of 64-bit words of a 224-bit element +#define NBITS_ORDER 224 +#define NWORDS64_ORDER ((NBITS_ORDER+63)/64) +// Number of elements in Alice's strategy +#define A_max 108 +// Number of elements in Bob's strategy +#define B_max 137 +// Word size size +#define RADIX sizeof(crypto_word_t)*8 +// Byte size of a limb +#define LSZ sizeof(crypto_word_t) + +#if defined(CPU_64_BIT) + typedef uint64_t crypto_word_t; + // Number of words of a 434-bit field element + #define NWORDS_FIELD 7 + // Number of "0" digits in the least significant part of p434 + 1 + #define ZERO_WORDS 3 + // U64_TO_WORDS expands |x| for a |crypto_word_t| array literal. + #define U64_TO_WORDS(x) UINT64_C(x) +#else + typedef uint32_t crypto_word_t; + // Number of words of a 434-bit field element + #define NWORDS_FIELD 14 + // Number of "0" digits in the least significant part of p434 + 1 + #define ZERO_WORDS 6 + // U64_TO_WORDS expands |x| for a |crypto_word_t| array literal. 
+ #define U64_TO_WORDS(x) \ + (uint32_t)(UINT64_C(x) & 0xffffffff), (uint32_t)(UINT64_C(x) >> 32) +#endif + +// Extended datatype support +#if !defined(HAS_UINT128) + typedef uint64_t uint128_t[2]; +#endif + +// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise +// Digit multiplication +#define MUL(multiplier, multiplicand, hi, lo) digit_x_digit((multiplier), (multiplicand), &(lo)); + +// Convert mask |x| to a bit: 1 if |x|==0xff..ff, 0 if |x|==0 (takes the most significant bit) +#define M2B(x) ((x)>>(RADIX-1)) + +// Digit addition with carry +#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ +do { \ + crypto_word_t tempReg = (addend1) + (crypto_word_t)(carryIn); \ + (sumOut) = (addend2) + tempReg; \ + (carryOut) = M2B(ct_uint_lt(tempReg, (crypto_word_t)(carryIn)) | \ + ct_uint_lt((sumOut), tempReg)); \ +} while(0) + +// Digit subtraction with borrow +#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ +do { \ + crypto_word_t tempReg = (minuend) - (subtrahend); \ + crypto_word_t borrowReg = M2B(ct_uint_lt((minuend), (subtrahend))); \ + borrowReg |= ((borrowIn) & ct_uint_eq(tempReg, 0)); \ + (differenceOut) = tempReg - (crypto_word_t)(borrowIn); \ + (borrowOut) = borrowReg; \ +} while(0) + +/* Old GCC 4.9 (jessie) doesn't implement {0} initialization properly, + which violates C11 as described in 6.7.9, 21 (similarly C99, 6.7.8). + Defines below are used to work around the bug, and provide a way + to initialize f2elm_t and point_proj_t structs. + Bug has been fixed in GCC6 (debian stretch). +*/ +#define F2ELM_INIT {{ {0}, {0} }} +#define POINT_PROJ_INIT {{ F2ELM_INIT, F2ELM_INIT }} + +// Datatype for representing 434-bit field elements (448-bit max.) +// Elements over GF(p434) are encoded in FIELD_BYTESZ (55) octets in little endian format +// (i.e., the least significant octet is located in the lowest memory address). +typedef crypto_word_t felm_t[NWORDS_FIELD]; + +// An element in F_{p^2}, is composed of two coefficients from F_p, * i.e. 
+// Fp2 element = c0 + c1*i in F_{p^2} +// Datatype for representing double-precision 2x434-bit field elements (448-bit max.) +// Elements (a+b*i) over GF(p434^2), where a and b are defined over GF(p434), are +// encoded as {a, b}, with a in the lowest memory portion. +typedef struct { + felm_t c0; + felm_t c1; +} fp2; + +// Our F_{p^2} element type is a pointer to the struct. +typedef fp2 f2elm_t[1]; + +// Datatype for representing double-precision 2x434-bit +// field elements in contiguous memory. +typedef crypto_word_t dfelm_t[2*NWORDS_FIELD]; + +// Constants used during SIKE computation. +struct params_t { + // Stores a prime + const crypto_word_t prime[NWORDS_FIELD]; + // Stores prime + 1 + const crypto_word_t prime_p1[NWORDS_FIELD]; + // Stores prime * 2 + const crypto_word_t prime_x2[NWORDS_FIELD]; + // Alice's generator values {XPA0 + XPA1*i, XQA0 + XQA1*i, XRA0 + XRA1*i} + // in GF(prime^2), expressed in Montgomery representation + const crypto_word_t A_gen[6*NWORDS_FIELD]; + // Bob's generator values {XPB0 + XPB1*i, XQB0 + XQB1*i, XRB0 + XRB1*i} + // in GF(prime^2), expressed in Montgomery representation + const crypto_word_t B_gen[6*NWORDS_FIELD]; + // Montgomery constant mont_R2 = (2^448)^2 mod prime + const crypto_word_t mont_R2[NWORDS_FIELD]; + // Value 'one' in Montgomery representation + const crypto_word_t mont_one[NWORDS_FIELD]; + // Value '6' in Montgomery representation + const crypto_word_t mont_six[NWORDS_FIELD]; + // Fixed parameters for isogeny tree computation + const unsigned int A_strat[A_max-1]; + const unsigned int B_strat[B_max-1]; +}; + +// Point representation in projective XZ Montgomery coordinates. +typedef struct { + f2elm_t X; + f2elm_t Z; +} point_proj; +typedef point_proj point_proj_t[1]; + +// Checks whether two words are equal. Returns 1 in case it is, +// otherwise 0. 
+static inline crypto_word_t ct_uint_eq(crypto_word_t x, crypto_word_t y) +{ + // if x==y then t = 0 + crypto_word_t t = x ^ y; + // if x!=y t will have its most significant bit set + t = (t >> 1) - t; + // return the inverted MSB: 1 in case x==y, otherwise 0 + return ((~t) >> (RADIX-1)); +} +// Constant time select. +// if flag == 1 (out = in1) +// if flag == 0 (out = in2) +// else out is undefined +static inline uint8_t ct_select_8(uint8_t flag, uint8_t in1, uint8_t in2) { + uint8_t mask = ((int8_t)(flag << 7))>>7; + return (in1&mask) | (in2&(~mask)); +} + +// Constant time memcmp. Returns 1 if p==q, otherwise 0 +static inline int ct_mem_eq(const void *p, const void *q, size_t n) +{ + const uint8_t *pp = (uint8_t*)p, *qq = (uint8_t*)q; + uint8_t a = 0; + + while (n--) a |= *pp++ ^ *qq++; + return (ct_uint_eq(a, 0)); +} + +static inline crypto_word_t constant_time_msb_w(crypto_word_t a) { + return 0u - (a >> (sizeof(a) * 8 - 1)); +} + +// ct_uint_lt returns 0xff..f if a < b and 0 otherwise. +static inline crypto_word_t ct_uint_lt(crypto_word_t x, crypto_word_t y) +{ + // Consider the two cases of the problem: + // msb(a) == msb(b): a < b iff the MSB of a - b is set. + // msb(a) != msb(b): a < b iff the MSB of b is set. + // + // If msb(a) == msb(b) then the following evaluates as: + // msb(a^((a^b)|((a-b)^a))) == + // msb(a^((a-b) ^ a)) == (because msb(a^b) == 0) + // msb(a^a^(a-b)) == (rearranging) + // msb(a-b) (because ∀x. x^x == 0) + // + // Else, if msb(a) != msb(b) then the following evaluates as: + // msb(a^((a^b)|((a-b)^a))) == + // msb(a^(𝟙 | ((a-b)^a))) == (because msb(a^b) == 1 and 𝟙 + // represents a value s.t. 
msb(𝟙) = 1) + // msb(a^𝟙) == (because ORing with 1 results in 1) + // msb(b) + // + // + // Here is an SMT-LIB verification of this formula: + // + // (define-fun lt ((a (_ BitVec 32)) (b (_ BitVec 32))) (_ BitVec 32) + // (bvxor a (bvor (bvxor a b) (bvxor (bvsub a b) a))) + // ) + // + // (declare-fun a () (_ BitVec 32)) + // (declare-fun b () (_ BitVec 32)) + // + // (assert (not (= (= #x00000001 (bvlshr (lt a b) #x0000001f)) (bvult a b)))) + // (check-sat) + // (get-model) + return constant_time_msb_w(x^((x^y)|((x-y)^x))); +} +#endif // UTILS_H_ From 51a41a31674d5ab572c7a30e47fcba21468819b2 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Fri, 9 Apr 2021 00:54:14 +0100 Subject: [PATCH 04/12] update sike --- src/kem/sike/includes/sike/sike.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/kem/sike/includes/sike/sike.h b/src/kem/sike/includes/sike/sike.h index 09d1e580..e71cde57 100644 --- a/src/kem/sike/includes/sike/sike.h +++ b/src/kem/sike/includes/sike/sike.h @@ -3,6 +3,7 @@ #include #include +#include "randombytes.h" /* SIKE * @@ -55,7 +56,8 @@ #define PQCLEAN_SIKE434_CLEAN_CRYPTO_ALGNAME "SIKE/p434" static inline int PQCLEAN_SIKE434_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - SIKE_keypair(sk, pk); + randombytes(sk, SIKE_MSG_BYTESZ); + SIKE_keypair(sk+SIKE_MSG_BYTESZ, pk); memcpy(&sk[SIKE_PRV_BYTESZ+SIKE_MSG_BYTESZ], pk, SIKE_PUB_BYTESZ); return 1; } From a2a2b08a84cb4ec0c231bb5b3d2b9a0e7d64cc6d Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Fri, 9 Apr 2021 07:10:50 +0100 Subject: [PATCH 05/12] add cmake --- src/kem/sike/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 src/kem/sike/CMakeLists.txt diff --git a/src/kem/sike/CMakeLists.txt b/src/kem/sike/CMakeLists.txt new file mode 100644 index 00000000..0a55522e --- /dev/null +++ b/src/kem/sike/CMakeLists.txt @@ -0,0 +1,12 @@ +set( + SRC_CLEAN_SIKE_P434 + p434/fpx.c + p434/isogeny.c + p434/fp_generic.c + p434/params.c + 
p434/sike.c) + +define_kem_alg( + sike_p434_clean + PQC_SIKEP434_CLEAN "${SRC_CLEAN_SIKE_P434}" "${CMAKE_CURRENT_SOURCE_DIR}") + From 8711dcce1ad3909d682948fa29537a372719482d Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Fri, 9 Apr 2021 10:38:06 +0100 Subject: [PATCH 06/12] SIKE/p434 goes thru KATs --- README.md | 1 + src/kem/sike/includes/sike/sike.h | 21 ++++++++++++++------- src/kem/sike/p434/sike.c | 26 +++++++++++--------------- test/katrunner/Cargo.toml | 2 +- test/katrunner/src/main.rs | 7 ++++--- 5 files changed, 31 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 75d59645..696e00b5 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Users shouldn't expect any level of security provided by this code. The library | Falcon | 2 | | | Rainbow | 3 | | | SPHINCS+ SHA256/SHAKE256 | 3 | x | +| SIKE/p434 | 3 | x | ## Building diff --git a/src/kem/sike/includes/sike/sike.h b/src/kem/sike/includes/sike/sike.h index e71cde57..ca0d03d7 100644 --- a/src/kem/sike/includes/sike/sike.h +++ b/src/kem/sike/includes/sike/sike.h @@ -49,26 +49,33 @@ const uint8_t priv_key[SIKE_PRV_BYTESZ]); // boilerplate needed for integration -#define PQCLEAN_SIKE434_CLEAN_CRYPTO_SECRETKEYBYTES SIKE_PRV_BYTESZ +#define PQCLEAN_SIKE434_CLEAN_CRYPTO_SECRETKEYBYTES SIKE_PRV_BYTESZ+SIKE_MSG_BYTESZ #define PQCLEAN_SIKE434_CLEAN_CRYPTO_PUBLICKEYBYTES SIKE_PUB_BYTESZ #define PQCLEAN_SIKE434_CLEAN_CRYPTO_CIPHERTEXTBYTES SIKE_CT_BYTESZ #define PQCLEAN_SIKE434_CLEAN_CRYPTO_BYTES SIKE_SS_BYTESZ #define PQCLEAN_SIKE434_CLEAN_CRYPTO_ALGNAME "SIKE/p434" +#define PQCLEAN_SIKE434_AVX2_CRYPTO_SECRETKEYBYTES SIKE_PRV_BYTESZ+SIKE_MSG_BYTESZ +#define PQCLEAN_SIKE434_AVX2_CRYPTO_PUBLICKEYBYTES SIKE_PUB_BYTESZ +#define PQCLEAN_SIKE434_AVX2_CRYPTO_CIPHERTEXTBYTES SIKE_CT_BYTESZ +#define PQCLEAN_SIKE434_AVX2_CRYPTO_BYTES SIKE_SS_BYTESZ +#define PQCLEAN_SIKE434_AVX2_CRYPTO_ALGNAME "SIKE/p434" + static inline int PQCLEAN_SIKE434_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - 
randombytes(sk, SIKE_MSG_BYTESZ); - SIKE_keypair(sk+SIKE_MSG_BYTESZ, pk); - memcpy(&sk[SIKE_PRV_BYTESZ+SIKE_MSG_BYTESZ], pk, SIKE_PUB_BYTESZ); - return 1; + SIKE_keypair(sk, pk); + // KATs require the public key to be concatenated after private key + // OZAPTF: maybe change KAT tester + memcpy(&sk[SIKE_MSG_BYTESZ+SIKE_PRV_BYTESZ], pk, SIKE_PUB_BYTESZ); + return 0; } static inline int PQCLEAN_SIKE434_CLEAN_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { SIKE_encaps(ss,ct,pk); - return 1; + return 0; } static inline int PQCLEAN_SIKE434_CLEAN_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { SIKE_decaps(ss, ct, &sk[SIKE_PRV_BYTESZ+SIKE_MSG_BYTESZ], sk); - return 1; + return 0; } diff --git a/src/kem/sike/p434/sike.c b/src/kem/sike/p434/sike.c index f52fe5c4..83a9dc1d 100644 --- a/src/kem/sike/p434/sike.c +++ b/src/kem/sike/p434/sike.c @@ -411,10 +411,10 @@ int SIKE_keypair(uint8_t out_priv[SIKE_PRV_BYTESZ], uint8_t out_pub[SIKE_PUB_BYTESZ]) { // Calculate private key for Alice. 
Needs to be in range [0, 2^0xFA - 1] and < // 253 bits - randombytes(out_priv, SIKE_PRV_BYTESZ); - out_priv[31] = (out_priv[31] | 0x01) & 0x03; - - gen_iso_B(out_priv, out_pub); + randombytes(out_priv, SIKE_MSG_BYTESZ); + randombytes(&out_priv[SIKE_MSG_BYTESZ], SIKE_PRV_BYTESZ); + out_priv[SIKE_MSG_BYTESZ+28-1] = (out_priv[SIKE_MSG_BYTESZ+28-1] & 0x01); + gen_iso_B(&out_priv[SIKE_MSG_BYTESZ], out_pub); return 1; } @@ -430,7 +430,7 @@ void SIKE_encaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], shake256incctx ctx; // Generate secret key for A - // secret key A = SHA256({0,1}^n || pub_key)) mod SIDH_PRV_A_BITSZ + // secret key A = SHAKE256({0,1}^n || pub_key)) mod SIDH_PRV_A_BITSZ randombytes(temp, SIKE_MSG_BYTESZ); shake256_inc_init(&ctx); @@ -444,7 +444,7 @@ void SIKE_encaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], gen_iso_A(secret, out_ciphertext); // Generate c1: - // h = SHA256(j-invariant) + // h = SHAKE256(j-invariant) // c1 = h ^ m ex_iso_A(secret, pub_key, j); shake256(secret, sizeof secret, j, sizeof j); @@ -461,14 +461,14 @@ void SIKE_encaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], shake256_inc_finalize(&ctx); shake256_inc_squeeze(secret, 32, &ctx); shake256_inc_ctx_release(&ctx); - // Generate shared secret out_shared_key = SHA256(m||out_ciphertext) + // Generate shared secret out_shared_key = SHAKE256(m||out_ciphertext) memcpy(out_shared_key, secret, SIKE_SS_BYTESZ); } void SIKE_decaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], const uint8_t ciphertext[SIKE_CT_BYTESZ], const uint8_t pub_key[SIKE_PUB_BYTESZ], - const uint8_t priv_key[SIKE_PRV_BYTESZ]) { + const uint8_t priv_key[SIKE_MSG_BYTESZ + SIKE_PRV_BYTESZ]) { // Secret buffer is reused by the function to store some ephemeral // secret data. It's size must be maximum of 64, // SIKE_MSG_BYTESZ and SIDH_PRV_A_BITSZ in bytes. 
@@ -476,16 +476,12 @@ void SIKE_decaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], uint8_t j[SIDH_JINV_BYTESZ]; uint8_t c0[SIKE_PUB_BYTESZ]; uint8_t temp[SIKE_MSG_BYTESZ]; - uint8_t shared_nok[SIKE_MSG_BYTESZ]; shake256incctx ctx; - // This is OK as we are only using ephemeral keys in BoringSSL - randombytes(shared_nok, SIKE_MSG_BYTESZ); - // Recover m // Let ciphertext = c0 || c1 - both have fixed sizes // m = F(j-invariant(c0, priv_key)) ^ c1 - ex_iso_B(priv_key, ciphertext, j); + ex_iso_B(&priv_key[SIKE_MSG_BYTESZ], ciphertext, j); shake256(secret, sizeof secret, j, sizeof j); @@ -507,7 +503,7 @@ void SIKE_decaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], crypto_word_t ok = ct_uint_eq( ct_mem_eq(c0, ciphertext, SIKE_PUB_BYTESZ), 1); for (size_t i = 0; i < SIKE_MSG_BYTESZ; i++) { - temp[i] = ct_select_8(ok, temp[i], shared_nok[i]); + temp[i] = ct_select_8(ok, temp[i], priv_key[i]); } shake256_inc_init(&ctx); @@ -517,6 +513,6 @@ void SIKE_decaps(uint8_t out_shared_key[SIKE_SS_BYTESZ], shake256_inc_squeeze(secret, 32, &ctx); shake256_inc_ctx_release(&ctx); - // Generate shared secret out_shared_key = SHA256(m||ciphertext) + // Generate shared secret out_shared_key = SHAKE256(m||ciphertext) memcpy(out_shared_key, secret, SIKE_SS_BYTESZ); } diff --git a/test/katrunner/Cargo.toml b/test/katrunner/Cargo.toml index c5adb4dc..fd07d6f3 100644 --- a/test/katrunner/Cargo.toml +++ b/test/katrunner/Cargo.toml @@ -11,4 +11,4 @@ hex = "0.4.2" threadpool = "1.8.1" rust-crypto = "^0.2" lazy_static = "1.4.0" -aes_ctr_drbg = "0.0.2" \ No newline at end of file +aes_ctr_drbg = "0.0.2" diff --git a/test/katrunner/src/main.rs b/test/katrunner/src/main.rs index f6ee752c..0da5c3e5 100644 --- a/test/katrunner/src/main.rs +++ b/test/katrunner/src/main.rs @@ -130,9 +130,10 @@ fn test_kem_vector(el: &TestVector) { // Check keygen pk.resize(el.kem.pk.len(), 0); sk.resize(el.kem.sk.len(), 0); - assert_eq!( - pqc_keygen(p, pk.as_mut_ptr(), sk.as_mut_ptr()), - true); + assert_eq!( + pqc_keygen(p, 
pk.as_mut_ptr(), sk.as_mut_ptr()), + true); + assert_eq!(sk, el.kem.sk); assert_eq!(pk, el.kem.pk); From c18ca419a8241bf2b895f135686409474f87766a Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Wed, 14 Apr 2021 08:04:14 +0100 Subject: [PATCH 07/12] SIKE: enable optimized version --- .gitmodules | 3 +++ 3rd/cpu_features | 1 + CMakeLists.txt | 4 ++++ src/capi/pqapi.c | 7 ++++++ src/capi/schemes.h | 2 +- src/kem/sike/CMakeLists.txt | 5 +++-- src/kem/sike/p434/fp-x86_64.S | 39 ++++++++++++++-------------------- src/kem/sike/p434/fp_generic.c | 38 +++++++++++++++++++++++++-------- src/kem/sike/p434/fp_glue.c | 4 ++++ 9 files changed, 68 insertions(+), 35 deletions(-) create mode 160000 3rd/cpu_features create mode 100644 src/kem/sike/p434/fp_glue.c diff --git a/.gitmodules b/.gitmodules index 84a57e26..b85836eb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "3rd/gbench"] path = 3rd/gbench url = https://github.com/henrydcase/benchmark.git +[submodule "3rd/cpu_features"] + path = 3rd/cpu_features + url = https://github.com/google/cpu_features.git diff --git a/3rd/cpu_features b/3rd/cpu_features new file mode 160000 index 00000000..3e8243b7 --- /dev/null +++ b/3rd/cpu_features @@ -0,0 +1 @@ +Subproject commit 3e8243b7d9951c078259c3186c039a6e8f036055 diff --git a/CMakeLists.txt b/CMakeLists.txt index 313200b3..f6961a8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,6 +34,7 @@ else() endif() add_subdirectory(3rd/gtest) +add_subdirectory(3rd/cpu_features) # Arch settings @@ -82,6 +83,7 @@ include_directories( public src/common/ src + 3rd/cpu_features/include ) set_property(GLOBAL PROPERTY obj_libs "") @@ -243,12 +245,14 @@ get_property(OBJ_LIBS GLOBAL PROPERTY obj_libs) target_link_libraries( pqc common + cpu_features ${OBJ_LIBS} ) target_link_libraries( pqc_s common + cpu_features ${OBJ_LIBS} ) diff --git a/src/capi/pqapi.c b/src/capi/pqapi.c index d00260d3..8a76b40f 100644 --- a/src/capi/pqapi.c +++ b/src/capi/pqapi.c @@ -1,6 +1,7 @@ 
#include #include #include +#include #include "schemes.h" @@ -126,3 +127,9 @@ bool pqc_sig_verify(const params_t *p, const uint8_t *pk) { return !((sig_params_t *)p)->verify(sig, siglen, m, mlen, pk); } + +X86Features CPU_CAPS; +void static_initialization(void) __attribute__((constructor)); +void static_initialization(void) { + CPU_CAPS = GetX86Info().features; +} diff --git a/src/capi/schemes.h b/src/capi/schemes.h index 60a68893..9ba29c94 100644 --- a/src/capi/schemes.h +++ b/src/capi/schemes.h @@ -115,4 +115,4 @@ #include "kem/hqc/hqc-rmrs-128/avx2/api.h" #include "kem/hqc/hqc-rmrs-192/avx2/api.h" #include "kem/hqc/hqc-rmrs-256/avx2/api.h" -#include "kem/sike/includes/sike/sike.h" \ No newline at end of file +#include "kem/sike/includes/sike/sike.h" diff --git a/src/kem/sike/CMakeLists.txt b/src/kem/sike/CMakeLists.txt index 0a55522e..8eb116cc 100644 --- a/src/kem/sike/CMakeLists.txt +++ b/src/kem/sike/CMakeLists.txt @@ -1,12 +1,13 @@ set( SRC_CLEAN_SIKE_P434 p434/fpx.c - p434/isogeny.c p434/fp_generic.c + p434/fp_glue.c + p434/fp-x86_64.S + p434/isogeny.c p434/params.c p434/sike.c) define_kem_alg( sike_p434_clean PQC_SIKEP434_CLEAN "${SRC_CLEAN_SIKE_P434}" "${CMAKE_CURRENT_SOURCE_DIR}") - diff --git a/src/kem/sike/p434/fp-x86_64.S b/src/kem/sike/p434/fp-x86_64.S index f2f32392..e6f30b27 100644 --- a/src/kem/sike/p434/fp-x86_64.S +++ b/src/kem/sike/p434/fp-x86_64.S @@ -15,10 +15,10 @@ .quad 0x6CFC5FD681C52056 .quad 0x0002341F27177344 -.globl sike_fpadd -.hidden sike_fpadd -.type sike_fpadd,@function -sike_fpadd: +.globl sike_fpadd_asm +.hidden sike_fpadd_asm +.type sike_fpadd_asm,@function +sike_fpadd_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 @@ -107,14 +107,7 @@ sike_fpadd: .hidden sike_cswap_asm .type sike_cswap_asm,@function sike_cswap_asm: - - movq %rdx,%xmm3 - - - - - pshufd $68,%xmm3,%xmm3 movdqu 0(%rdi),%xmm0 @@ -258,10 +251,10 @@ sike_cswap_asm: movdqu %xmm1,208(%rsi) .byte 0xf3,0xc3 -.globl sike_fpsub -.hidden sike_fpsub -.type 
sike_fpsub,@function -sike_fpsub: +.globl sike_fpsub_asm +.hidden sike_fpsub_asm +.type sike_fpsub_asm,@function +sike_fpsub_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 @@ -508,10 +501,10 @@ sike_mpdblsubx2_asm: .byte 0xf3,0xc3 .cfi_endproc -.globl sike_fprdc -.hidden sike_fprdc -.type sike_fprdc,@function -sike_fprdc: +.globl sike_fprdc_asm +.hidden sike_fprdc_asm +.type sike_fprdc_asm,@function +sike_fprdc_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 @@ -723,10 +716,10 @@ sike_fprdc: .cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 .cfi_endproc -.globl sike_mpmul -.hidden sike_mpmul -.type sike_mpmul,@function -sike_mpmul: +.globl sike_mpmul_asm +.hidden sike_mpmul_asm +.type sike_mpmul_asm,@function +sike_mpmul_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 diff --git a/src/kem/sike/p434/fp_generic.c b/src/kem/sike/p434/fp_generic.c index 02e851cf..7fa75d1f 100644 --- a/src/kem/sike/p434/fp_generic.c +++ b/src/kem/sike/p434/fp_generic.c @@ -5,12 +5,16 @@ *********************************************************************************************/ #include "utils.h" #include "fpx.h" +#include + +extern X86Features CPU_CAPS; // Global constants extern const struct params_t params; +// Digit multiplication, digit * digit -> 2-digit result static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c) -{ // Digit multiplication, digit * digit -> 2-digit result +{ crypto_word_t al, ah, bl, bh, temp; crypto_word_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4); @@ -43,10 +47,11 @@ static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_w c[1] ^= (ahbh & mask_high) + carry; // C11 } +// Modular addition, c = a+b mod p434. +// Inputs: a, b in [0, 2*p434-1] +// Output: c in [0, 2*p434-1] void sike_fpadd(const felm_t a, const felm_t b, felm_t c) -{ // Modular addition, c = a+b mod p434. 
- // Inputs: a, b in [0, 2*p434-1] - // Output: c in [0, 2*p434-1] +{ unsigned int i, carry = 0; crypto_word_t mask; @@ -84,12 +89,20 @@ void sike_fpsub(const felm_t a, const felm_t b, felm_t c) } } +// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. +void sike_mpmul_asm(const felm_t a, const felm_t b, dfelm_t c); void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) -{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. +{ unsigned int i, j; crypto_word_t t = 0, u = 0, v = 0, UV[2]; unsigned int carry = 0; + // TODO: it actually needs BMI2 & ADOX. cpu_features needs to be updated + if (CPU_CAPS.bmi2) { + sike_mpmul_asm(a,b,c); + return; + } + for (i = 0; i < NWORDS_FIELD; i++) { for (j = 0; j <= i; j++) { MUL(a[j], b[i-j], UV+1, UV[0]); @@ -118,11 +131,18 @@ void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) c[2*NWORDS_FIELD-1] = v; } +// Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. +// mc = ma*R^-1 mod p434x2, where R = 2^448. +// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. +// ma is assumed to be in Montgomery representation. +void sike_fprdc_asm(const felm_t ma, felm_t mc); void sike_fprdc(const felm_t ma, felm_t mc) -{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. - // mc = ma*R^-1 mod p434x2, where R = 2^448. - // If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. - // ma is assumed to be in Montgomery representation. 
+{ + if (CPU_CAPS.bmi2) { + sike_fprdc_asm(ma, mc); + return; + } + unsigned int i, j, carry, count = ZERO_WORDS; crypto_word_t UV[2], t = 0, u = 0, v = 0; diff --git a/src/kem/sike/p434/fp_glue.c b/src/kem/sike/p434/fp_glue.c new file mode 100644 index 00000000..0495a3a4 --- /dev/null +++ b/src/kem/sike/p434/fp_glue.c @@ -0,0 +1,4 @@ +#include "fpx.h" +#include "utils.h" + +void sike_mpmul_asm_X(const felm_t a, const felm_t b, dfelm_t c); \ No newline at end of file From ac4f2b7918c832b3241f633c9f7eb0325a8aeb6f Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Sun, 18 Apr 2021 23:40:08 +0100 Subject: [PATCH 08/12] change path to cpu_features submodule --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index b85836eb..143e62d9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,4 +9,4 @@ url = https://github.com/henrydcase/benchmark.git [submodule "3rd/cpu_features"] path = 3rd/cpu_features - url = https://github.com/google/cpu_features.git + url = https://github.com/henrydcase/cpu_features.git From 3683dcfa641a2b971d96fb00b4022d4bf5408752 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Sun, 18 Apr 2021 23:44:53 +0100 Subject: [PATCH 09/12] update cpu_features submodule --- 3rd/cpu_features | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rd/cpu_features b/3rd/cpu_features index 3e8243b7..2b07c2ab 160000 --- a/3rd/cpu_features +++ b/3rd/cpu_features @@ -1 +1 @@ -Subproject commit 3e8243b7d9951c078259c3186c039a6e8f036055 +Subproject commit 2b07c2ab7df71d0b6c19afb93f68a808b412a7ff From 6b9aa0e10b2340ce012494738ae5aa6db1fd3371 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Sun, 18 Apr 2021 23:50:59 +0100 Subject: [PATCH 10/12] check if adox available --- src/kem/sike/p434/fp_generic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/kem/sike/p434/fp_generic.c b/src/kem/sike/p434/fp_generic.c index 7fa75d1f..8634b850 100644 --- 
a/src/kem/sike/p434/fp_generic.c +++ b/src/kem/sike/p434/fp_generic.c @@ -97,8 +97,7 @@ void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) crypto_word_t t = 0, u = 0, v = 0, UV[2]; unsigned int carry = 0; - // TODO: it actually needs BMI2 & ADOX. cpu_features needs to be updated - if (CPU_CAPS.bmi2) { + if (CPU_CAPS.bmi2 && CPU_CAPS.adx) { sike_mpmul_asm(a,b,c); return; } @@ -138,7 +137,7 @@ void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) void sike_fprdc_asm(const felm_t ma, felm_t mc); void sike_fprdc(const felm_t ma, felm_t mc) { - if (CPU_CAPS.bmi2) { + if (CPU_CAPS.bmi2 && CPU_CAPS.adx) { sike_fprdc_asm(ma, mc); return; } From 4dcce2cc7e2ed76441dcc36f25dc741865feb092 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Sun, 18 Apr 2021 23:57:41 +0100 Subject: [PATCH 11/12] use haswell as default arch --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f6961a8c..14d0c09f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,7 +156,7 @@ add_subdirectory(src/kem/sike) if(${ARCH} STREQUAL "ARCH_x86_64") set(CMAKE_C_FLAGS - "${CMAKE_C_FLAGS} -march=native -mtune=native") + "${CMAKE_C_FLAGS} -march=haswell") set(SRC_COMMON_AVX2 src/common/keccak4x/KeccakP-1600-times4-SIMD256.c ) From de3f719a9dcb3326b3186b449b9ec62004328661 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Sun, 18 Apr 2021 23:59:38 +0100 Subject: [PATCH 12/12] add drone.yml --- buid.dbg/.drone.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 buid.dbg/.drone.yml diff --git a/buid.dbg/.drone.yml b/buid.dbg/.drone.yml new file mode 100644 index 00000000..41c554ac --- /dev/null +++ b/buid.dbg/.drone.yml @@ -0,0 +1,14 @@ +kind: pipeline +type: exec +name: default + +steps: +- name: build + commands: + - git submodule init + - git submodule update --recursive --remote + - mkdir build + - cd build + - cmake .. + - make + - ./test