From 8108f2b03f6ef69f225611d99ad3ab13dd904b65 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Wed, 14 Apr 2021 08:04:14 +0100 Subject: [PATCH] SIKE: enable optimized version Adds the cpu_features library from Google to detect the capabilities of the CPU on which the implementation is running. Uses that library to select, at run time, either the generic C or the assembly-optimized implementation of some field operations. --- .gitmodules | 3 +++ 3rd/cpu_features | 1 + CMakeLists.txt | 15 ++++++++---- src/capi/pqapi.c | 13 +++++++++++ src/capi/schemes.h | 7 +++++- src/common/utils.h | 8 +++++++ src/kem/sike/CMakeLists.txt | 5 ++-- src/kem/sike/p434/fp-x86_64.S | 39 +++++++++++++------------------- src/kem/sike/p434/fp_generic.c | 37 ++++++++++++++++++++++-------- src/kem/sike/p434/fp_glue.c | 4 ++++ src/rustapi/pqc-sys/src/build.rs | 4 +++- 11 files changed, 96 insertions(+), 40 deletions(-) create mode 160000 3rd/cpu_features create mode 100644 src/common/utils.h create mode 100644 src/kem/sike/p434/fp_glue.c diff --git a/.gitmodules b/.gitmodules index 84a57e26..143e62d9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "3rd/gbench"] path = 3rd/gbench url = https://github.com/henrydcase/benchmark.git +[submodule "3rd/cpu_features"] + path = 3rd/cpu_features + url = https://github.com/henrydcase/cpu_features.git diff --git a/3rd/cpu_features b/3rd/cpu_features new file mode 160000 index 00000000..2b07c2ab --- /dev/null +++ b/3rd/cpu_features @@ -0,0 +1 @@ +Subproject commit 2b07c2ab7df71d0b6c19afb93f68a808b412a7ff diff --git a/CMakeLists.txt b/CMakeLists.txt index 89815671..55d38205 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,13 @@ cmake_minimum_required(VERSION 3.13) -project(cryptocore NONE) +project(cryptocore VERSION 0.0.1 LANGUAGES C) enable_language(C) enable_language(CXX) enable_language(ASM) +add_subdirectory(3rd/gtest) +add_subdirectory(3rd/cpu_features) + set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "~/.cmake/Modules") 
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "3rd/cmake-modules") @@ -48,7 +51,6 @@ if(NOT CMAKE_BUILD_TYPE_LOWER STREQUAL "debug") add_subdirectory(${CMAKE_SOURCE_DIR}/3rd/gbench) endif() - # Arch settings if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") @@ -96,6 +98,7 @@ include_directories( public src/common/ src + 3rd/cpu_features/include ) set_property(GLOBAL PROPERTY obj_libs "") @@ -168,7 +171,7 @@ add_subdirectory(src/kem/sike) if(${ARCH} STREQUAL "ARCH_x86_64") set(CMAKE_C_FLAGS - "${CMAKE_C_FLAGS} -march=native -mtune=native") + "${CMAKE_C_FLAGS} -march=haswell") set(SRC_COMMON_AVX2 src/common/keccak4x/KeccakP-1600-times4-SIMD256.c ) @@ -256,12 +259,16 @@ get_property(OBJ_LIBS GLOBAL PROPERTY obj_libs) target_link_libraries( pqc - common + ${OBJ_LIBS} + cpu_features + common ) target_link_libraries( pqc_s + + cpu_features common ${OBJ_LIBS} ) diff --git a/src/capi/pqapi.c b/src/capi/pqapi.c index d00260d3..3ea5a57c 100644 --- a/src/capi/pqapi.c +++ b/src/capi/pqapi.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "schemes.h" @@ -76,6 +77,13 @@ const sig_params_t sigs[] = { PQC_SUPPORTED_SIGS(REG_SIG) }; +// Contains capabilities on x86 CPU on which implementation is running +X86Features CPU_CAPS; + +const X86Features * const get_cpu_caps(void) { + return &CPU_CAPS; +} + const params_t *pqc_kem_alg_by_id(uint8_t id) { int i; for(i=0; iverify(sig, siglen, m, mlen, pk); } + +void static_initialization(void) __attribute__((constructor)); +void static_initialization(void) { + CPU_CAPS = GetX86Info().features; +} diff --git a/src/capi/schemes.h b/src/capi/schemes.h index 60a68893..c03563ee 100644 --- a/src/capi/schemes.h +++ b/src/capi/schemes.h @@ -1,3 +1,6 @@ +#ifndef PQC_SCHEMES_ +#define PQC_SCHEMES_ + // PQClean include #include "sign/rainbow/rainbowV-classic/clean/api.h" #include "sign/rainbow/rainbowI-classic/clean/api.h" @@ -115,4 +118,6 @@ #include "kem/hqc/hqc-rmrs-128/avx2/api.h" #include "kem/hqc/hqc-rmrs-192/avx2/api.h" #include 
"kem/hqc/hqc-rmrs-256/avx2/api.h" -#include "kem/sike/includes/sike/sike.h" \ No newline at end of file +#include "kem/sike/includes/sike/sike.h" + +#endif \ No newline at end of file diff --git a/src/common/utils.h b/src/common/utils.h new file mode 100644 index 00000000..469f67f2 --- /dev/null +++ b/src/common/utils.h @@ -0,0 +1,8 @@ +#ifndef PQC_COMMON_UTILS_ +#define PQC_COMMON_UTILS_ + +#include + +const X86Features * const get_cpu_caps(void); + +#endif \ No newline at end of file diff --git a/src/kem/sike/CMakeLists.txt b/src/kem/sike/CMakeLists.txt index 0a55522e..8eb116cc 100644 --- a/src/kem/sike/CMakeLists.txt +++ b/src/kem/sike/CMakeLists.txt @@ -1,12 +1,13 @@ set( SRC_CLEAN_SIKE_P434 p434/fpx.c - p434/isogeny.c p434/fp_generic.c + p434/fp_glue.c + p434/fp-x86_64.S + p434/isogeny.c p434/params.c p434/sike.c) define_kem_alg( sike_p434_clean PQC_SIKEP434_CLEAN "${SRC_CLEAN_SIKE_P434}" "${CMAKE_CURRENT_SOURCE_DIR}") - diff --git a/src/kem/sike/p434/fp-x86_64.S b/src/kem/sike/p434/fp-x86_64.S index f2f32392..e6f30b27 100644 --- a/src/kem/sike/p434/fp-x86_64.S +++ b/src/kem/sike/p434/fp-x86_64.S @@ -15,10 +15,10 @@ .quad 0x6CFC5FD681C52056 .quad 0x0002341F27177344 -.globl sike_fpadd -.hidden sike_fpadd -.type sike_fpadd,@function -sike_fpadd: +.globl sike_fpadd_asm +.hidden sike_fpadd_asm +.type sike_fpadd_asm,@function +sike_fpadd_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 @@ -107,14 +107,7 @@ sike_fpadd: .hidden sike_cswap_asm .type sike_cswap_asm,@function sike_cswap_asm: - - movq %rdx,%xmm3 - - - - - pshufd $68,%xmm3,%xmm3 movdqu 0(%rdi),%xmm0 @@ -258,10 +251,10 @@ sike_cswap_asm: movdqu %xmm1,208(%rsi) .byte 0xf3,0xc3 -.globl sike_fpsub -.hidden sike_fpsub -.type sike_fpsub,@function -sike_fpsub: +.globl sike_fpsub_asm +.hidden sike_fpsub_asm +.type sike_fpsub_asm,@function +sike_fpsub_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 @@ -508,10 +501,10 @@ sike_mpdblsubx2_asm: .byte 0xf3,0xc3 .cfi_endproc -.globl sike_fprdc -.hidden 
sike_fprdc -.type sike_fprdc,@function -sike_fprdc: +.globl sike_fprdc_asm +.hidden sike_fprdc_asm +.type sike_fprdc_asm,@function +sike_fprdc_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 @@ -723,10 +716,10 @@ sike_fprdc: .cfi_adjust_cfa_offset -8 .byte 0xf3,0xc3 .cfi_endproc -.globl sike_mpmul -.hidden sike_mpmul -.type sike_mpmul,@function -sike_mpmul: +.globl sike_mpmul_asm +.hidden sike_mpmul_asm +.type sike_mpmul_asm,@function +sike_mpmul_asm: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 diff --git a/src/kem/sike/p434/fp_generic.c b/src/kem/sike/p434/fp_generic.c index 02e851cf..38e29926 100644 --- a/src/kem/sike/p434/fp_generic.c +++ b/src/kem/sike/p434/fp_generic.c @@ -3,14 +3,17 @@ * * Abstract: portable modular arithmetic for P503 *********************************************************************************************/ +#include "common/utils.h" + #include "utils.h" #include "fpx.h" // Global constants extern const struct params_t params; +// Digit multiplication, digit * digit -> 2-digit result static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c) -{ // Digit multiplication, digit * digit -> 2-digit result +{ crypto_word_t al, ah, bl, bh, temp; crypto_word_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4); @@ -43,10 +46,11 @@ static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_w c[1] ^= (ahbh & mask_high) + carry; // C11 } +// Modular addition, c = a+b mod p434. +// Inputs: a, b in [0, 2*p434-1] +// Output: c in [0, 2*p434-1] void sike_fpadd(const felm_t a, const felm_t b, felm_t c) -{ // Modular addition, c = a+b mod p434. 
- // Inputs: a, b in [0, 2*p434-1] - // Output: c in [0, 2*p434-1] +{ unsigned int i, carry = 0; crypto_word_t mask; @@ -84,12 +88,20 @@ void sike_fpsub(const felm_t a, const felm_t b, felm_t c) } } +// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. +void sike_mpmul_asm(const felm_t a, const felm_t b, dfelm_t c); void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) -{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. +{ unsigned int i, j; crypto_word_t t = 0, u = 0, v = 0, UV[2]; unsigned int carry = 0; + // TODO: faster would be to use bitmap + if (get_cpu_caps()->bmi2 && get_cpu_caps()->adx) { + sike_mpmul_asm(a,b,c); + return; + } + for (i = 0; i < NWORDS_FIELD; i++) { for (j = 0; j <= i; j++) { MUL(a[j], b[i-j], UV+1, UV[0]); @@ -118,11 +130,18 @@ void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) c[2*NWORDS_FIELD-1] = v; } +// Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. +// mc = ma*R^-1 mod p434x2, where R = 2^448. +// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. +// ma is assumed to be in Montgomery representation. +void sike_fprdc_asm(const felm_t ma, felm_t mc); void sike_fprdc(const felm_t ma, felm_t mc) -{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. - // mc = ma*R^-1 mod p434x2, where R = 2^448. - // If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. - // ma is assumed to be in Montgomery representation. 
+{ + if (get_cpu_caps()->bmi2 && get_cpu_caps()->adx) { + sike_fprdc_asm(ma, mc); + return; + } + unsigned int i, j, carry, count = ZERO_WORDS; crypto_word_t UV[2], t = 0, u = 0, v = 0; diff --git a/src/kem/sike/p434/fp_glue.c b/src/kem/sike/p434/fp_glue.c new file mode 100644 index 00000000..0495a3a4 --- /dev/null +++ b/src/kem/sike/p434/fp_glue.c @@ -0,0 +1,4 @@ +#include "fpx.h" +#include "utils.h" + +void sike_mpmul_asm_X(const felm_t a, const felm_t b, dfelm_t c); \ No newline at end of file diff --git a/src/rustapi/pqc-sys/src/build.rs b/src/rustapi/pqc-sys/src/build.rs index 943a471e..761dd312 100644 --- a/src/rustapi/pqc-sys/src/build.rs +++ b/src/rustapi/pqc-sys/src/build.rs @@ -6,10 +6,12 @@ fn main() { let dst = Config::new("../../../") .profile("Release") .very_verbose(true) - .build(); + .build(); println!("cargo:rustc-link-search=native={}/lib", dst.display()); println!("cargo:rustc-link-lib=static=pqc_s"); + // For some reason the GetX86Info symbol is undefined in pqc_s, hence this line. + println!("cargo:rustc-link-lib=static=cpu_features"); println!("cargo:rerun-if-changed=../../../capi/*,../../../kem/*,../../../sign/*,../../../../public/pqc/pqc.h"); // The bindgen::Builder is the main entry point