Adds the cpu_features library from Google to detect the capabilities of the CPU on which the implementation is running. Uses that library to select either the generic C or the assembly-optimized implementation of some field operations. (tags/v0.0.1)
@@ -7,3 +7,6 @@ | |||
[submodule "3rd/gbench"] | |||
path = 3rd/gbench | |||
url = https://github.com/henrydcase/benchmark.git | |||
[submodule "3rd/cpu_features"] | |||
path = 3rd/cpu_features | |||
url = https://github.com/henrydcase/cpu_features.git |
@@ -0,0 +1 @@ | |||
Subproject commit 2b07c2ab7df71d0b6c19afb93f68a808b412a7ff |
@@ -1,10 +1,13 @@ | |||
cmake_minimum_required(VERSION 3.13) | |||
project(cryptocore NONE) | |||
project(cryptocore VERSION 0.0.1 LANGUAGES C) | |||
enable_language(C) | |||
enable_language(CXX) | |||
enable_language(ASM) | |||
add_subdirectory(3rd/gtest) | |||
add_subdirectory(3rd/cpu_features) | |||
set(CMAKE_VERBOSE_MAKEFILE ON) | |||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "~/.cmake/Modules") | |||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "3rd/cmake-modules") | |||
@@ -48,7 +51,6 @@ if(NOT CMAKE_BUILD_TYPE_LOWER STREQUAL "debug") | |||
add_subdirectory(${CMAKE_SOURCE_DIR}/3rd/gbench) | |||
endif() | |||
# Arch settings | |||
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") | |||
@@ -96,6 +98,7 @@ include_directories( | |||
public | |||
src/common/ | |||
src | |||
3rd/cpu_features/include | |||
) | |||
set_property(GLOBAL PROPERTY obj_libs "") | |||
@@ -168,7 +171,7 @@ add_subdirectory(src/kem/sike) | |||
if(${ARCH} STREQUAL "ARCH_x86_64") | |||
set(CMAKE_C_FLAGS | |||
"${CMAKE_C_FLAGS} -march=native -mtune=native") | |||
"${CMAKE_C_FLAGS} -march=haswell") | |||
set(SRC_COMMON_AVX2 | |||
src/common/keccak4x/KeccakP-1600-times4-SIMD256.c | |||
) | |||
@@ -256,12 +259,16 @@ get_property(OBJ_LIBS GLOBAL PROPERTY obj_libs) | |||
target_link_libraries( | |||
pqc | |||
common | |||
${OBJ_LIBS} | |||
cpu_features | |||
common | |||
) | |||
target_link_libraries( | |||
pqc_s | |||
cpu_features | |||
common | |||
${OBJ_LIBS} | |||
) | |||
@@ -1,6 +1,7 @@ | |||
#include <stdint.h> | |||
#include <stdbool.h> | |||
#include <pqc/pqc.h> | |||
#include <cpuinfo_x86.h> | |||
#include "schemes.h" | |||
@@ -76,6 +77,13 @@ const sig_params_t sigs[] = { | |||
PQC_SUPPORTED_SIGS(REG_SIG) | |||
}; | |||
// Feature flags of the x86 CPU on which the implementation is running. | |||
// Assigned in static_initialization() from GetX86Info().features. | |||
X86Features CPU_CAPS; | |||
// Returns a read-only pointer to the CPU feature flags stored in CPU_CAPS. | |||
const X86Features * const get_cpu_caps(void) { | |||
return &CPU_CAPS; | |||
} | |||
const params_t *pqc_kem_alg_by_id(uint8_t id) { | |||
int i; | |||
for(i=0; i<PQC_ALG_KEM_MAX; i++) { | |||
@@ -126,3 +134,8 @@ bool pqc_sig_verify(const params_t *p, | |||
const uint8_t *pk) { | |||
return !((sig_params_t *)p)->verify(sig, siglen, m, mlen, pk); | |||
} | |||
// Declared with the GCC/Clang "constructor" attribute so that it is run | |||
// automatically at load time, before main() is entered. | |||
void static_initialization(void) __attribute__((constructor)); | |||
// Queries the CPU through cpu_features' GetX86Info() and caches the reported | |||
// feature set in CPU_CAPS. | |||
void static_initialization(void) { | |||
CPU_CAPS = GetX86Info().features; | |||
} |
@@ -1,3 +1,6 @@ | |||
#ifndef PQC_SCHEMES_ | |||
#define PQC_SCHEMES_ | |||
// PQClean include | |||
#include "sign/rainbow/rainbowV-classic/clean/api.h" | |||
#include "sign/rainbow/rainbowI-classic/clean/api.h" | |||
@@ -115,4 +118,6 @@ | |||
#include "kem/hqc/hqc-rmrs-128/avx2/api.h" | |||
#include "kem/hqc/hqc-rmrs-192/avx2/api.h" | |||
#include "kem/hqc/hqc-rmrs-256/avx2/api.h" | |||
#include "kem/sike/includes/sike/sike.h" | |||
#include "kem/sike/includes/sike/sike.h" | |||
#endif |
@@ -0,0 +1,8 @@ | |||
#ifndef PQC_COMMON_UTILS_ | |||
#define PQC_COMMON_UTILS_ | |||
// X86Features comes from the cpu_features library. | |||
#include <cpuinfo_x86.h> | |||
// Returns a pointer to the x86 CPU feature flags (bmi2, adx, ...) that callers | |||
// consult to choose between generic C and assembly implementations. | |||
const X86Features * const get_cpu_caps(void); | |||
#endif |
@@ -1,12 +1,13 @@ | |||
set( | |||
SRC_CLEAN_SIKE_P434 | |||
p434/fpx.c | |||
p434/isogeny.c | |||
p434/fp_generic.c | |||
p434/fp_glue.c | |||
p434/fp-x86_64.S | |||
p434/isogeny.c | |||
p434/params.c | |||
p434/sike.c) | |||
define_kem_alg( | |||
sike_p434_clean | |||
PQC_SIKEP434_CLEAN "${SRC_CLEAN_SIKE_P434}" "${CMAKE_CURRENT_SOURCE_DIR}") | |||
@@ -15,10 +15,10 @@ | |||
.quad 0x6CFC5FD681C52056 | |||
.quad 0x0002341F27177344 | |||
.globl sike_fpadd | |||
.hidden sike_fpadd | |||
.type sike_fpadd,@function | |||
sike_fpadd: | |||
.globl sike_fpadd_asm | |||
.hidden sike_fpadd_asm | |||
.type sike_fpadd_asm,@function | |||
sike_fpadd_asm: | |||
.cfi_startproc | |||
pushq %r12 | |||
.cfi_adjust_cfa_offset 8 | |||
@@ -107,14 +107,7 @@ sike_fpadd: | |||
.hidden sike_cswap_asm | |||
.type sike_cswap_asm,@function | |||
sike_cswap_asm: | |||
movq %rdx,%xmm3 | |||
pshufd $68,%xmm3,%xmm3 | |||
movdqu 0(%rdi),%xmm0 | |||
@@ -258,10 +251,10 @@ sike_cswap_asm: | |||
movdqu %xmm1,208(%rsi) | |||
.byte 0xf3,0xc3 | |||
.globl sike_fpsub | |||
.hidden sike_fpsub | |||
.type sike_fpsub,@function | |||
sike_fpsub: | |||
.globl sike_fpsub_asm | |||
.hidden sike_fpsub_asm | |||
.type sike_fpsub_asm,@function | |||
sike_fpsub_asm: | |||
.cfi_startproc | |||
pushq %r12 | |||
.cfi_adjust_cfa_offset 8 | |||
@@ -508,10 +501,10 @@ sike_mpdblsubx2_asm: | |||
.byte 0xf3,0xc3 | |||
.cfi_endproc | |||
.globl sike_fprdc | |||
.hidden sike_fprdc | |||
.type sike_fprdc,@function | |||
sike_fprdc: | |||
.globl sike_fprdc_asm | |||
.hidden sike_fprdc_asm | |||
.type sike_fprdc_asm,@function | |||
sike_fprdc_asm: | |||
.cfi_startproc | |||
pushq %r12 | |||
.cfi_adjust_cfa_offset 8 | |||
@@ -723,10 +716,10 @@ sike_fprdc: | |||
.cfi_adjust_cfa_offset -8 | |||
.byte 0xf3,0xc3 | |||
.cfi_endproc | |||
.globl sike_mpmul | |||
.hidden sike_mpmul | |||
.type sike_mpmul,@function | |||
sike_mpmul: | |||
.globl sike_mpmul_asm | |||
.hidden sike_mpmul_asm | |||
.type sike_mpmul_asm,@function | |||
sike_mpmul_asm: | |||
.cfi_startproc | |||
pushq %r12 | |||
.cfi_adjust_cfa_offset 8 | |||
@@ -3,14 +3,17 @@ | |||
* | |||
* Abstract: portable modular arithmetic for P503 | |||
*********************************************************************************************/ | |||
#include "common/utils.h" | |||
#include "utils.h" | |||
#include "fpx.h" | |||
// Global constants | |||
extern const struct params_t params; | |||
// Digit multiplication, digit * digit -> 2-digit result | |||
static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c) | |||
{ // Digit multiplication, digit * digit -> 2-digit result | |||
{ | |||
crypto_word_t al, ah, bl, bh, temp; | |||
crypto_word_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; | |||
crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4); | |||
@@ -43,10 +46,11 @@ static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_w | |||
c[1] ^= (ahbh & mask_high) + carry; // C11 | |||
} | |||
// Modular addition, c = a+b mod p434. | |||
// Inputs: a, b in [0, 2*p434-1] | |||
// Output: c in [0, 2*p434-1] | |||
void sike_fpadd(const felm_t a, const felm_t b, felm_t c) | |||
{ // Modular addition, c = a+b mod p434. | |||
// Inputs: a, b in [0, 2*p434-1] | |||
// Output: c in [0, 2*p434-1] | |||
{ | |||
unsigned int i, carry = 0; | |||
crypto_word_t mask; | |||
@@ -84,12 +88,20 @@ void sike_fpsub(const felm_t a, const felm_t b, felm_t c) | |||
} | |||
} | |||
// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. | |||
void sike_mpmul_asm(const felm_t a, const felm_t b, dfelm_t c); | |||
void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) | |||
{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. | |||
{ | |||
unsigned int i, j; | |||
crypto_word_t t = 0, u = 0, v = 0, UV[2]; | |||
unsigned int carry = 0; | |||
// TODO: faster would be to use bitmap | |||
if (get_cpu_caps()->bmi2 && get_cpu_caps()->adx) { | |||
sike_mpmul_asm(a,b,c); | |||
return; | |||
} | |||
for (i = 0; i < NWORDS_FIELD; i++) { | |||
for (j = 0; j <= i; j++) { | |||
MUL(a[j], b[i-j], UV+1, UV[0]); | |||
@@ -118,11 +130,18 @@ void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) | |||
c[2*NWORDS_FIELD-1] = v; | |||
} | |||
// Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. | |||
// mc = ma*R^-1 mod p434x2, where R = 2^448. | |||
// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. | |||
// ma is assumed to be in Montgomery representation. | |||
void sike_fprdc_asm(const felm_t ma, felm_t mc); | |||
void sike_fprdc(const felm_t ma, felm_t mc) | |||
{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. | |||
// mc = ma*R^-1 mod p434x2, where R = 2^448. | |||
// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. | |||
// ma is assumed to be in Montgomery representation. | |||
{ | |||
if (get_cpu_caps()->bmi2 && get_cpu_caps()->adx) { | |||
sike_fprdc_asm(ma, mc); | |||
return; | |||
} | |||
unsigned int i, j, carry, count = ZERO_WORDS; | |||
crypto_word_t UV[2], t = 0, u = 0, v = 0; | |||
@@ -0,0 +1,4 @@ | |||
#include "fpx.h" | |||
#include "utils.h" | |||
void sike_mpmul_asm_X(const felm_t a, const felm_t b, dfelm_t c); |
@@ -6,10 +6,12 @@ fn main() { | |||
let dst = Config::new("../../../") | |||
.profile("Release") | |||
.very_verbose(true) | |||
.build(); | |||
.build(); | |||
println!("cargo:rustc-link-search=native={}/lib", dst.display()); | |||
println!("cargo:rustc-link-lib=static=pqc_s"); | |||
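// The GetX86Info symbol is left undefined in pqc_s (presumably because a static library does not bundle the objects of the libraries it depends on), so cpu_features must be linked explicitly. | |||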
println!("cargo:rustc-link-lib=static=cpu_features"); | |||
println!("cargo:rerun-if-changed=../../../capi/*,../../../kem/*,../../../sign/*,../../../../public/pqc/pqc.h"); | |||
// The bindgen::Builder is the main entry point | |||