From c18ca419a8241bf2b895f135686409474f87766a Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Wed, 14 Apr 2021 08:04:14 +0100 Subject: [PATCH] SIKE: enable optimized version --- .gitmodules | 3 +++ 3rd/cpu_features | 1 + CMakeLists.txt | 4 ++++ src/capi/pqapi.c | 7 ++++++ src/capi/schemes.h | 2 +- src/kem/sike/CMakeLists.txt | 5 +++-- src/kem/sike/p434/fp-x86_64.S | 39 ++++++++++++++-------------------- src/kem/sike/p434/fp_generic.c | 38 +++++++++++++++++++++++++-------- src/kem/sike/p434/fp_glue.c | 4 ++++ 9 files changed, 68 insertions(+), 35 deletions(-) create mode 160000 3rd/cpu_features create mode 100644 src/kem/sike/p434/fp_glue.c diff --git a/.gitmodules b/.gitmodules index 84a57e26..b85836eb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "3rd/gbench"] path = 3rd/gbench url = https://github.com/henrydcase/benchmark.git +[submodule "3rd/cpu_features"] + path = 3rd/cpu_features + url = https://github.com/google/cpu_features.git diff --git a/3rd/cpu_features b/3rd/cpu_features new file mode 160000 index 00000000..3e8243b7 --- /dev/null +++ b/3rd/cpu_features @@ -0,0 +1 @@ +Subproject commit 3e8243b7d9951c078259c3186c039a6e8f036055 diff --git a/CMakeLists.txt b/CMakeLists.txt index 313200b3..f6961a8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,6 +34,7 @@ else() endif() add_subdirectory(3rd/gtest) +add_subdirectory(3rd/cpu_features) # Arch settings @@ -82,6 +83,7 @@ include_directories( public src/common/ src + 3rd/cpu_features/include ) set_property(GLOBAL PROPERTY obj_libs "") @@ -243,12 +245,14 @@ get_property(OBJ_LIBS GLOBAL PROPERTY obj_libs) target_link_libraries( pqc common + cpu_features ${OBJ_LIBS} ) target_link_libraries( pqc_s common + cpu_features ${OBJ_LIBS} ) diff --git a/src/capi/pqapi.c b/src/capi/pqapi.c index d00260d3..8a76b40f 100644 --- a/src/capi/pqapi.c +++ b/src/capi/pqapi.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "schemes.h" @@ -126,3 +127,9 @@ bool pqc_sig_verify(const params_t *p, const uint8_t *pk) { return !((sig_params_t *)p)->verify(sig, siglen, m, mlen, pk); } + +X86Features CPU_CAPS; +void static_initialization(void) __attribute__((constructor)); +void static_initialization(void) { + CPU_CAPS = GetX86Info().features; +} diff --git a/src/capi/schemes.h b/src/capi/schemes.h index 60a68893..9ba29c94 100644 --- a/src/capi/schemes.h +++ b/src/capi/schemes.h @@ -115,4 +115,4 @@ #include "kem/hqc/hqc-rmrs-128/avx2/api.h" #include "kem/hqc/hqc-rmrs-192/avx2/api.h" #include "kem/hqc/hqc-rmrs-256/avx2/api.h" -#include "kem/sike/includes/sike/sike.h" \ No newline at end of file +#include "kem/sike/includes/sike/sike.h" diff --git a/src/kem/sike/CMakeLists.txt b/src/kem/sike/CMakeLists.txt index 0a55522e..8eb116cc 100644 --- a/src/kem/sike/CMakeLists.txt +++ b/src/kem/sike/CMakeLists.txt @@ -1,12 +1,13 @@ set( SRC_CLEAN_SIKE_P434 p434/fpx.c - p434/isogeny.c p434/fp_generic.c + p434/fp_glue.c + p434/fp-x86_64.S + p434/isogeny.c p434/params.c p434/sike.c) define_kem_alg( sike_p434_clean PQC_SIKEP434_CLEAN "${SRC_CLEAN_SIKE_P434}" "${CMAKE_CURRENT_SOURCE_DIR}") - diff --git a/src/kem/sike/p434/fp-x86_64.S b/src/kem/sike/p434/fp-x86_64.S index f2f32392..e6f30b27 100644 --- a/src/kem/sike/p434/fp-x86_64.S +++ b/src/kem/sike/p434/fp-x86_64.S @@ -15,10 +15,10 @@ .quad 0x6CFC5FD681C52056 .quad 0x0002341F27177344 -.globl sike_fpadd -.hidden sike_fpadd -.type sike_fpadd,@function -sike_fpadd: +.globl sike_fpadd_asm +.hidden sike_fpadd_asm +.type sike_fpadd_asm,@function +sike_fpadd_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 @@ -107,14 +107,7 @@ sike_fpadd: .hidden sike_cswap_asm .type sike_cswap_asm,@function sike_cswap_asm: - - movq %rdx,%xmm3 - - - - - pshufd $68,%xmm3,%xmm3 movdqu 0(%rdi),%xmm0 @@ -258,10 +251,10 @@ sike_cswap_asm: movdqu %xmm1,208(%rsi) .byte 0xf3,0xc3 -.globl sike_fpsub -.hidden sike_fpsub -.type sike_fpsub,@function -sike_fpsub: +.globl sike_fpsub_asm +.hidden sike_fpsub_asm +.type sike_fpsub_asm,@function +sike_fpsub_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 @@ -508,10 +501,10 @@ sike_mpdblsubx2_asm: .byte 0xf3,0xc3 .cfi_endproc -.globl sike_fprdc -.hidden sike_fprdc -.type sike_fprdc,@function -sike_fprdc: +.globl sike_fprdc_asm +.hidden sike_fprdc_asm +.type sike_fprdc_asm,@function +sike_fprdc_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 @@ -723,10 +716,10 @@ sike_fprdc: .cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 .cfi_endproc -.globl sike_mpmul -.hidden sike_mpmul -.type sike_mpmul,@function -sike_mpmul: +.globl sike_mpmul_asm +.hidden sike_mpmul_asm +.type sike_mpmul_asm,@function +sike_mpmul_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 diff --git a/src/kem/sike/p434/fp_generic.c b/src/kem/sike/p434/fp_generic.c index 02e851cf..7fa75d1f 100644 --- a/src/kem/sike/p434/fp_generic.c +++ b/src/kem/sike/p434/fp_generic.c @@ -5,12 +5,16 @@ *********************************************************************************************/ #include "utils.h" #include "fpx.h" +#include + +extern X86Features CPU_CAPS; // Global constants extern const struct params_t params; +// Digit multiplication, digit * digit -> 2-digit result static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c) -{ // Digit multiplication, digit * digit -> 2-digit result +{ crypto_word_t al, ah, bl, bh, temp; crypto_word_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4); @@ -43,10 +47,11 @@ static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_w c[1] ^= (ahbh & mask_high) + carry; // C11 } +// Modular addition, c = a+b mod p434. +// Inputs: a, b in [0, 2*p434-1] +// Output: c in [0, 2*p434-1] void sike_fpadd(const felm_t a, const felm_t b, felm_t c) -{ // Modular addition, c = a+b mod p434. - // Inputs: a, b in [0, 2*p434-1] - // Output: c in [0, 2*p434-1] +{ unsigned int i, carry = 0; crypto_word_t mask; @@ -84,12 +89,20 @@ void sike_fpsub(const felm_t a, const felm_t b, felm_t c) } } +// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. +void sike_mpmul_asm(const felm_t a, const felm_t b, dfelm_t c); void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) -{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. +{ unsigned int i, j; crypto_word_t t = 0, u = 0, v = 0, UV[2]; unsigned int carry = 0; + // TODO: it actually needs BMI2 & ADOX. cpu_features needs to be updated + if (CPU_CAPS.bmi2) { + sike_mpmul_asm(a,b,c); + return; + } + for (i = 0; i < NWORDS_FIELD; i++) { for (j = 0; j <= i; j++) { MUL(a[j], b[i-j], UV+1, UV[0]); @@ -118,11 +131,18 @@ void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) c[2*NWORDS_FIELD-1] = v; } +// Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. +// mc = ma*R^-1 mod p434x2, where R = 2^448. +// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. +// ma is assumed to be in Montgomery representation. +void sike_fprdc_asm(const felm_t ma, felm_t mc); void sike_fprdc(const felm_t ma, felm_t mc) -{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. - // mc = ma*R^-1 mod p434x2, where R = 2^448. - // If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. - // ma is assumed to be in Montgomery representation. +{ + if (CPU_CAPS.bmi2) { + sike_fprdc_asm(ma, mc); + return; + } + unsigned int i, j, carry, count = ZERO_WORDS; crypto_word_t UV[2], t = 0, u = 0, v = 0; diff --git a/src/kem/sike/p434/fp_glue.c b/src/kem/sike/p434/fp_glue.c new file mode 100644 index 00000000..0495a3a4 --- /dev/null +++ b/src/kem/sike/p434/fp_glue.c @@ -0,0 +1,4 @@ +#include "fpx.h" +#include "utils.h" + +void sike_mpmul_asm_X(const felm_t a, const felm_t b, dfelm_t c); \ No newline at end of file