SIKE: enable optimized version

Adds Google's cpu_features library to detect the
capabilities of the CPU on which the implementation
is running.

Uses that library to select, at run time, either the
generic C implementation or the assembly-optimized
implementation of some field operations.
This commit is contained in:
Henry Case 2021-04-14 08:04:14 +01:00 committed by Henry Case
parent 9cb7e5a265
commit 89a34ac04b
11 changed files with 96 additions and 40 deletions

3
.gitmodules vendored
View File

@ -7,3 +7,6 @@
[submodule "3rd/gbench"] [submodule "3rd/gbench"]
path = 3rd/gbench path = 3rd/gbench
url = https://github.com/henrydcase/benchmark.git url = https://github.com/henrydcase/benchmark.git
[submodule "3rd/cpu_features"]
path = 3rd/cpu_features
url = https://github.com/henrydcase/cpu_features.git

1
3rd/cpu_features Submodule

@ -0,0 +1 @@
Subproject commit 2b07c2ab7df71d0b6c19afb93f68a808b412a7ff

View File

@ -1,10 +1,13 @@
cmake_minimum_required(VERSION 3.13) cmake_minimum_required(VERSION 3.13)
project(cryptocore NONE) project(cryptocore VERSION 0.0.1 LANGUAGES C)
enable_language(C) enable_language(C)
enable_language(CXX) enable_language(CXX)
enable_language(ASM) enable_language(ASM)
add_subdirectory(3rd/gtest)
add_subdirectory(3rd/cpu_features)
set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "~/.cmake/Modules") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "~/.cmake/Modules")
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "3rd/cmake-modules") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "3rd/cmake-modules")
@ -48,7 +51,6 @@ if(NOT CMAKE_BUILD_TYPE_LOWER STREQUAL "debug")
add_subdirectory(${CMAKE_SOURCE_DIR}/3rd/gbench) add_subdirectory(${CMAKE_SOURCE_DIR}/3rd/gbench)
endif() endif()
# Arch settings # Arch settings
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
@ -96,6 +98,7 @@ include_directories(
public public
src/common/ src/common/
src src
3rd/cpu_features/include
) )
set_property(GLOBAL PROPERTY obj_libs "") set_property(GLOBAL PROPERTY obj_libs "")
@ -168,7 +171,7 @@ add_subdirectory(src/kem/sike)
if(${ARCH} STREQUAL "ARCH_x86_64") if(${ARCH} STREQUAL "ARCH_x86_64")
set(CMAKE_C_FLAGS set(CMAKE_C_FLAGS
"${CMAKE_C_FLAGS} -march=native -mtune=native") "${CMAKE_C_FLAGS} -march=haswell")
set(SRC_COMMON_AVX2 set(SRC_COMMON_AVX2
src/common/keccak4x/KeccakP-1600-times4-SIMD256.c src/common/keccak4x/KeccakP-1600-times4-SIMD256.c
) )
@ -256,12 +259,16 @@ get_property(OBJ_LIBS GLOBAL PROPERTY obj_libs)
target_link_libraries( target_link_libraries(
pqc pqc
common
${OBJ_LIBS} ${OBJ_LIBS}
cpu_features
common
) )
target_link_libraries( target_link_libraries(
pqc_s pqc_s
cpu_features
common common
${OBJ_LIBS} ${OBJ_LIBS}
) )

View File

@ -1,6 +1,7 @@
#include <stdint.h> #include <stdint.h>
#include <stdbool.h> #include <stdbool.h>
#include <pqc/pqc.h> #include <pqc/pqc.h>
#include <cpuinfo_x86.h>
#include "schemes.h" #include "schemes.h"
@ -76,6 +77,13 @@ const sig_params_t sigs[] = {
PQC_SUPPORTED_SIGS(REG_SIG) PQC_SUPPORTED_SIGS(REG_SIG)
}; };
// Capabilities of the x86 CPU on which the implementation is running.
// Written by static_initialization(), which copies GetX86Info().features
// into it.
X86Features CPU_CAPS;
// Returns a read-only pointer to the CPU_CAPS record above.
const X86Features * const get_cpu_caps(void) {
return &CPU_CAPS;
}
const params_t *pqc_kem_alg_by_id(uint8_t id) { const params_t *pqc_kem_alg_by_id(uint8_t id) {
int i; int i;
for(i=0; i<PQC_ALG_KEM_MAX; i++) { for(i=0; i<PQC_ALG_KEM_MAX; i++) {
@ -126,3 +134,8 @@ bool pqc_sig_verify(const params_t *p,
const uint8_t *pk) { const uint8_t *pk) {
return !((sig_params_t *)p)->verify(sig, siglen, m, mlen, pk); return !((sig_params_t *)p)->verify(sig, siglen, m, mlen, pk);
} }
// Fills CPU_CAPS with the feature set reported by GetX86Info().
// Declared with the constructor attribute, so it is expected to run
// automatically when the library is loaded, before main().
// NOTE(review): this function has external linkage and a fairly generic
// name; consider making it static if nothing else references it - confirm.
void static_initialization(void) __attribute__((constructor));
void static_initialization(void) {
CPU_CAPS = GetX86Info().features;
}

View File

@ -1,3 +1,6 @@
#ifndef PQC_SCHEMES_
#define PQC_SCHEMES_
// PQClean include // PQClean include
#include "sign/rainbow/rainbowV-classic/clean/api.h" #include "sign/rainbow/rainbowV-classic/clean/api.h"
#include "sign/rainbow/rainbowI-classic/clean/api.h" #include "sign/rainbow/rainbowI-classic/clean/api.h"
@ -116,3 +119,5 @@
#include "kem/hqc/hqc-rmrs-192/avx2/api.h" #include "kem/hqc/hqc-rmrs-192/avx2/api.h"
#include "kem/hqc/hqc-rmrs-256/avx2/api.h" #include "kem/hqc/hqc-rmrs-256/avx2/api.h"
#include "kem/sike/includes/sike/sike.h" #include "kem/sike/includes/sike/sike.h"
#endif

8
src/common/utils.h Normal file
View File

@ -0,0 +1,8 @@
// Include guard for the common utilities header.
#ifndef PQC_COMMON_UTILS_
#define PQC_COMMON_UTILS_
// Provides the X86Features type used in the declaration below.
#include <cpuinfo_x86.h>
// Returns a read-only pointer to the library's X86Features record
// describing the CPU the code is running on.
const X86Features * const get_cpu_caps(void);
#endif

View File

@ -1,12 +1,13 @@
set( set(
SRC_CLEAN_SIKE_P434 SRC_CLEAN_SIKE_P434
p434/fpx.c p434/fpx.c
p434/isogeny.c
p434/fp_generic.c p434/fp_generic.c
p434/fp_glue.c
p434/fp-x86_64.S
p434/isogeny.c
p434/params.c p434/params.c
p434/sike.c) p434/sike.c)
define_kem_alg( define_kem_alg(
sike_p434_clean sike_p434_clean
PQC_SIKEP434_CLEAN "${SRC_CLEAN_SIKE_P434}" "${CMAKE_CURRENT_SOURCE_DIR}") PQC_SIKEP434_CLEAN "${SRC_CLEAN_SIKE_P434}" "${CMAKE_CURRENT_SOURCE_DIR}")

View File

@ -15,10 +15,10 @@
.quad 0x6CFC5FD681C52056 .quad 0x6CFC5FD681C52056
.quad 0x0002341F27177344 .quad 0x0002341F27177344
.globl sike_fpadd .globl sike_fpadd_asm
.hidden sike_fpadd .hidden sike_fpadd_asm
.type sike_fpadd,@function .type sike_fpadd_asm,@function
sike_fpadd: sike_fpadd_asm:
.cfi_startproc .cfi_startproc
pushq %r12 pushq %r12
.cfi_adjust_cfa_offset 8 .cfi_adjust_cfa_offset 8
@ -107,14 +107,7 @@ sike_fpadd:
.hidden sike_cswap_asm .hidden sike_cswap_asm
.type sike_cswap_asm,@function .type sike_cswap_asm,@function
sike_cswap_asm: sike_cswap_asm:
movq %rdx,%xmm3 movq %rdx,%xmm3
pshufd $68,%xmm3,%xmm3 pshufd $68,%xmm3,%xmm3
movdqu 0(%rdi),%xmm0 movdqu 0(%rdi),%xmm0
@ -258,10 +251,10 @@ sike_cswap_asm:
movdqu %xmm1,208(%rsi) movdqu %xmm1,208(%rsi)
.byte 0xf3,0xc3 .byte 0xf3,0xc3
.globl sike_fpsub .globl sike_fpsub_asm
.hidden sike_fpsub .hidden sike_fpsub_asm
.type sike_fpsub,@function .type sike_fpsub_asm,@function
sike_fpsub: sike_fpsub_asm:
.cfi_startproc .cfi_startproc
pushq %r12 pushq %r12
.cfi_adjust_cfa_offset 8 .cfi_adjust_cfa_offset 8
@ -508,10 +501,10 @@ sike_mpdblsubx2_asm:
.byte 0xf3,0xc3 .byte 0xf3,0xc3
.cfi_endproc .cfi_endproc
.globl sike_fprdc .globl sike_fprdc_asm
.hidden sike_fprdc .hidden sike_fprdc_asm
.type sike_fprdc,@function .type sike_fprdc_asm,@function
sike_fprdc: sike_fprdc_asm:
.cfi_startproc .cfi_startproc
pushq %r12 pushq %r12
.cfi_adjust_cfa_offset 8 .cfi_adjust_cfa_offset 8
@ -723,10 +716,10 @@ sike_fprdc:
.cfi_adjust_cfa_offset -8 .cfi_adjust_cfa_offset -8
.byte 0xf3,0xc3 .byte 0xf3,0xc3
.cfi_endproc .cfi_endproc
.globl sike_mpmul .globl sike_mpmul_asm
.hidden sike_mpmul .hidden sike_mpmul_asm
.type sike_mpmul,@function .type sike_mpmul_asm,@function
sike_mpmul: sike_mpmul_asm:
.cfi_startproc .cfi_startproc
pushq %r12 pushq %r12
.cfi_adjust_cfa_offset 8 .cfi_adjust_cfa_offset 8

View File

@ -3,14 +3,17 @@
* *
* Abstract: portable modular arithmetic for P503 * Abstract: portable modular arithmetic for P503
*********************************************************************************************/ *********************************************************************************************/
#include "common/utils.h"
#include "utils.h" #include "utils.h"
#include "fpx.h" #include "fpx.h"
// Global constants // Global constants
extern const struct params_t params; extern const struct params_t params;
// Digit multiplication, digit * digit -> 2-digit result
static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c) static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c)
{ // Digit multiplication, digit * digit -> 2-digit result {
crypto_word_t al, ah, bl, bh, temp; crypto_word_t al, ah, bl, bh, temp;
crypto_word_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; crypto_word_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4); crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4);
@ -43,10 +46,11 @@ static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_w
c[1] ^= (ahbh & mask_high) + carry; // C11 c[1] ^= (ahbh & mask_high) + carry; // C11
} }
void sike_fpadd(const felm_t a, const felm_t b, felm_t c) // Modular addition, c = a+b mod p434.
{ // Modular addition, c = a+b mod p434.
// Inputs: a, b in [0, 2*p434-1] // Inputs: a, b in [0, 2*p434-1]
// Output: c in [0, 2*p434-1] // Output: c in [0, 2*p434-1]
void sike_fpadd(const felm_t a, const felm_t b, felm_t c)
{
unsigned int i, carry = 0; unsigned int i, carry = 0;
crypto_word_t mask; crypto_word_t mask;
@ -84,12 +88,20 @@ void sike_fpsub(const felm_t a, const felm_t b, felm_t c)
} }
} }
// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD.
void sike_mpmul_asm(const felm_t a, const felm_t b, dfelm_t c);
void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c)
{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. {
unsigned int i, j; unsigned int i, j;
crypto_word_t t = 0, u = 0, v = 0, UV[2]; crypto_word_t t = 0, u = 0, v = 0, UV[2];
unsigned int carry = 0; unsigned int carry = 0;
// TODO: faster would be to use bitmap
if (get_cpu_caps()->bmi2 && get_cpu_caps()->adx) {
sike_mpmul_asm(a,b,c);
return;
}
for (i = 0; i < NWORDS_FIELD; i++) { for (i = 0; i < NWORDS_FIELD; i++) {
for (j = 0; j <= i; j++) { for (j = 0; j <= i; j++) {
MUL(a[j], b[i-j], UV+1, UV[0]); MUL(a[j], b[i-j], UV+1, UV[0]);
@ -118,11 +130,18 @@ void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c)
c[2*NWORDS_FIELD-1] = v; c[2*NWORDS_FIELD-1] = v;
} }
void sike_fprdc(const felm_t ma, felm_t mc) // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434.
{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434.
// mc = ma*R^-1 mod p434x2, where R = 2^448. // mc = ma*R^-1 mod p434x2, where R = 2^448.
// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. // If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1].
// ma is assumed to be in Montgomery representation. // ma is assumed to be in Montgomery representation.
void sike_fprdc_asm(const felm_t ma, felm_t mc);
void sike_fprdc(const felm_t ma, felm_t mc)
{
if (get_cpu_caps()->bmi2 && get_cpu_caps()->adx) {
sike_fprdc_asm(ma, mc);
return;
}
unsigned int i, j, carry, count = ZERO_WORDS; unsigned int i, j, carry, count = ZERO_WORDS;
crypto_word_t UV[2], t = 0, u = 0, v = 0; crypto_word_t UV[2], t = 0, u = 0, v = 0;

View File

@ -0,0 +1,4 @@
#include "fpx.h"
#include "utils.h"
// NOTE(review): sike_mpmul_asm_X is only declared here; no definition or
// caller is visible in this change (the assembly routine is exported as
// sike_mpmul_asm). Presumably a placeholder - confirm before relying on it.
// felm_t / dfelm_t are assumed to come from the headers included above.
void sike_mpmul_asm_X(const felm_t a, const felm_t b, dfelm_t c);

View File

@ -10,6 +10,8 @@ fn main() {
println!("cargo:rustc-link-search=native={}/lib", dst.display()); println!("cargo:rustc-link-search=native={}/lib", dst.display());
println!("cargo:rustc-link-lib=static=pqc_s"); println!("cargo:rustc-link-lib=static=pqc_s");
// The GetX86Info symbol is undefined in pqc_s (presumably because a static
// archive does not bundle the libraries it depends on), so link cpu_features explicitly.
println!("cargo:rustc-link-lib=static=cpu_features");
println!("cargo:rerun-if-changed=../../../capi/*,../../../kem/*,../../../sign/*,../../../../public/pqc/pqc.h"); println!("cargo:rerun-if-changed=../../../capi/*,../../../kem/*,../../../sign/*,../../../../public/pqc/pqc.h");
// The bindgen::Builder is the main entry point // The bindgen::Builder is the main entry point