Adds the cpu_features library from Google to detect the capabilities of the CPU on which the implementation is running. Uses that library to select, at run time, either the generic C or the assembly-optimized implementation of some field operations. (branch: kris/add_picnic)
@@ -7,3 +7,6 @@ | |||||
[submodule "3rd/gbench"] | [submodule "3rd/gbench"] | ||||
path = 3rd/gbench | path = 3rd/gbench | ||||
url = https://github.com/henrydcase/benchmark.git | url = https://github.com/henrydcase/benchmark.git | ||||
[submodule "3rd/cpu_features"] | |||||
path = 3rd/cpu_features | |||||
url = https://github.com/henrydcase/cpu_features.git |
@@ -0,0 +1 @@ | |||||
Subproject commit 2b07c2ab7df71d0b6c19afb93f68a808b412a7ff |
@@ -1,10 +1,13 @@ | |||||
cmake_minimum_required(VERSION 3.13) | cmake_minimum_required(VERSION 3.13) | ||||
project(cryptocore NONE) | |||||
project(cryptocore VERSION 0.0.1 LANGUAGES C) | |||||
enable_language(C) | enable_language(C) | ||||
enable_language(CXX) | enable_language(CXX) | ||||
enable_language(ASM) | enable_language(ASM) | ||||
add_subdirectory(3rd/gtest) | |||||
add_subdirectory(3rd/cpu_features) | |||||
set(CMAKE_VERBOSE_MAKEFILE ON) | set(CMAKE_VERBOSE_MAKEFILE ON) | ||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "~/.cmake/Modules") | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "~/.cmake/Modules") | ||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "3rd/cmake-modules") | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "3rd/cmake-modules") | ||||
@@ -48,7 +51,6 @@ if(NOT CMAKE_BUILD_TYPE_LOWER STREQUAL "debug") | |||||
add_subdirectory(${CMAKE_SOURCE_DIR}/3rd/gbench) | add_subdirectory(${CMAKE_SOURCE_DIR}/3rd/gbench) | ||||
endif() | endif() | ||||
# Arch settings | # Arch settings | ||||
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") | if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") | ||||
@@ -96,6 +98,7 @@ include_directories( | |||||
public | public | ||||
src/common/ | src/common/ | ||||
src | src | ||||
3rd/cpu_features/include | |||||
) | ) | ||||
set_property(GLOBAL PROPERTY obj_libs "") | set_property(GLOBAL PROPERTY obj_libs "") | ||||
@@ -168,7 +171,7 @@ add_subdirectory(src/kem/sike) | |||||
if(${ARCH} STREQUAL "ARCH_x86_64") | if(${ARCH} STREQUAL "ARCH_x86_64") | ||||
set(CMAKE_C_FLAGS | set(CMAKE_C_FLAGS | ||||
"${CMAKE_C_FLAGS} -march=native -mtune=native") | |||||
"${CMAKE_C_FLAGS} -march=haswell") | |||||
set(SRC_COMMON_AVX2 | set(SRC_COMMON_AVX2 | ||||
src/common/keccak4x/KeccakP-1600-times4-SIMD256.c | src/common/keccak4x/KeccakP-1600-times4-SIMD256.c | ||||
) | ) | ||||
@@ -256,12 +259,16 @@ get_property(OBJ_LIBS GLOBAL PROPERTY obj_libs) | |||||
target_link_libraries( | target_link_libraries( | ||||
pqc | pqc | ||||
common | |||||
${OBJ_LIBS} | ${OBJ_LIBS} | ||||
cpu_features | |||||
common | |||||
) | ) | ||||
target_link_libraries( | target_link_libraries( | ||||
pqc_s | pqc_s | ||||
cpu_features | |||||
common | common | ||||
${OBJ_LIBS} | ${OBJ_LIBS} | ||||
) | ) | ||||
@@ -1,6 +1,7 @@ | |||||
#include <stdint.h> | #include <stdint.h> | ||||
#include <stdbool.h> | #include <stdbool.h> | ||||
#include <pqc/pqc.h> | #include <pqc/pqc.h> | ||||
#include <cpuinfo_x86.h> | |||||
#include "schemes.h" | #include "schemes.h" | ||||
@@ -76,6 +77,13 @@ const sig_params_t sigs[] = { | |||||
PQC_SUPPORTED_SIGS(REG_SIG) | PQC_SUPPORTED_SIGS(REG_SIG) | ||||
}; | }; | ||||
// Capabilities of the x86 CPU on which the implementation is running. | |||||
// Assigned from GetX86Info().features by static_initialization() in this file.
X86Features CPU_CAPS; | |||||
// Returns a read-only pointer to the global CPU_CAPS structure.
// NOTE(review): the top-level `const` on the returned pointer has no effect
// for callers; compilers may warn about an ignored qualifier - confirm intent.
const X86Features * const get_cpu_caps(void) { | |||||
return &CPU_CAPS; | |||||
} | |||||
const params_t *pqc_kem_alg_by_id(uint8_t id) { | const params_t *pqc_kem_alg_by_id(uint8_t id) { | ||||
int i; | int i; | ||||
for(i=0; i<PQC_ALG_KEM_MAX; i++) { | for(i=0; i<PQC_ALG_KEM_MAX; i++) { | ||||
@@ -126,3 +134,8 @@ bool pqc_sig_verify(const params_t *p, | |||||
const uint8_t *pk) { | const uint8_t *pk) { | ||||
return !((sig_params_t *)p)->verify(sig, siglen, m, mlen, pk); | return !((sig_params_t *)p)->verify(sig, siglen, m, mlen, pk); | ||||
} | } | ||||
// Initializer for the library's global state. The `constructor` attribute
// (GCC/Clang extension) requests that this function be called automatically
// before main() is entered.
// It fills CPU_CAPS with the feature set reported by GetX86Info().
void static_initialization(void) __attribute__((constructor)); | |||||
void static_initialization(void) { | |||||
CPU_CAPS = GetX86Info().features; | |||||
} |
@@ -1,3 +1,6 @@ | |||||
#ifndef PQC_SCHEMES_ | |||||
#define PQC_SCHEMES_ | |||||
// PQClean include | // PQClean include | ||||
#include "sign/rainbow/rainbowV-classic/clean/api.h" | #include "sign/rainbow/rainbowV-classic/clean/api.h" | ||||
#include "sign/rainbow/rainbowI-classic/clean/api.h" | #include "sign/rainbow/rainbowI-classic/clean/api.h" | ||||
@@ -115,4 +118,6 @@ | |||||
#include "kem/hqc/hqc-rmrs-128/avx2/api.h" | #include "kem/hqc/hqc-rmrs-128/avx2/api.h" | ||||
#include "kem/hqc/hqc-rmrs-192/avx2/api.h" | #include "kem/hqc/hqc-rmrs-192/avx2/api.h" | ||||
#include "kem/hqc/hqc-rmrs-256/avx2/api.h" | #include "kem/hqc/hqc-rmrs-256/avx2/api.h" | ||||
#include "kem/sike/includes/sike/sike.h" | |||||
#include "kem/sike/includes/sike/sike.h" | |||||
#endif |
@@ -0,0 +1,8 @@ | |||||
#ifndef PQC_COMMON_UTILS_ | |||||
#define PQC_COMMON_UTILS_ | |||||
#include <cpuinfo_x86.h> | |||||
// Returns a pointer to the X86Features structure that describes the
// capabilities of the x86 CPU the code is running on. Callers only read
// individual feature fields through it (e.g. ->bmi2, ->adx).
const X86Features * const get_cpu_caps(void); | |||||
#endif |
@@ -1,12 +1,13 @@ | |||||
set( | set( | ||||
SRC_CLEAN_SIKE_P434 | SRC_CLEAN_SIKE_P434 | ||||
p434/fpx.c | p434/fpx.c | ||||
p434/isogeny.c | |||||
p434/fp_generic.c | p434/fp_generic.c | ||||
p434/fp_glue.c | |||||
p434/fp-x86_64.S | |||||
p434/isogeny.c | |||||
p434/params.c | p434/params.c | ||||
p434/sike.c) | p434/sike.c) | ||||
define_kem_alg( | define_kem_alg( | ||||
sike_p434_clean | sike_p434_clean | ||||
PQC_SIKEP434_CLEAN "${SRC_CLEAN_SIKE_P434}" "${CMAKE_CURRENT_SOURCE_DIR}") | PQC_SIKEP434_CLEAN "${SRC_CLEAN_SIKE_P434}" "${CMAKE_CURRENT_SOURCE_DIR}") | ||||
@@ -15,10 +15,10 @@ | |||||
.quad 0x6CFC5FD681C52056 | .quad 0x6CFC5FD681C52056 | ||||
.quad 0x0002341F27177344 | .quad 0x0002341F27177344 | ||||
.globl sike_fpadd | |||||
.hidden sike_fpadd | |||||
.type sike_fpadd,@function | |||||
sike_fpadd: | |||||
.globl sike_fpadd_asm | |||||
.hidden sike_fpadd_asm | |||||
.type sike_fpadd_asm,@function | |||||
sike_fpadd_asm: | |||||
.cfi_startproc | .cfi_startproc | ||||
pushq %r12 | pushq %r12 | ||||
.cfi_adjust_cfa_offset 8 | .cfi_adjust_cfa_offset 8 | ||||
@@ -107,14 +107,7 @@ sike_fpadd: | |||||
.hidden sike_cswap_asm | .hidden sike_cswap_asm | ||||
.type sike_cswap_asm,@function | .type sike_cswap_asm,@function | ||||
sike_cswap_asm: | sike_cswap_asm: | ||||
movq %rdx,%xmm3 | movq %rdx,%xmm3 | ||||
pshufd $68,%xmm3,%xmm3 | pshufd $68,%xmm3,%xmm3 | ||||
movdqu 0(%rdi),%xmm0 | movdqu 0(%rdi),%xmm0 | ||||
@@ -258,10 +251,10 @@ sike_cswap_asm: | |||||
movdqu %xmm1,208(%rsi) | movdqu %xmm1,208(%rsi) | ||||
.byte 0xf3,0xc3 | .byte 0xf3,0xc3 | ||||
.globl sike_fpsub | |||||
.hidden sike_fpsub | |||||
.type sike_fpsub,@function | |||||
sike_fpsub: | |||||
.globl sike_fpsub_asm | |||||
.hidden sike_fpsub_asm | |||||
.type sike_fpsub_asm,@function | |||||
sike_fpsub_asm: | |||||
.cfi_startproc | .cfi_startproc | ||||
pushq %r12 | pushq %r12 | ||||
.cfi_adjust_cfa_offset 8 | .cfi_adjust_cfa_offset 8 | ||||
@@ -508,10 +501,10 @@ sike_mpdblsubx2_asm: | |||||
.byte 0xf3,0xc3 | .byte 0xf3,0xc3 | ||||
.cfi_endproc | .cfi_endproc | ||||
.globl sike_fprdc | |||||
.hidden sike_fprdc | |||||
.type sike_fprdc,@function | |||||
sike_fprdc: | |||||
.globl sike_fprdc_asm | |||||
.hidden sike_fprdc_asm | |||||
.type sike_fprdc_asm,@function | |||||
sike_fprdc_asm: | |||||
.cfi_startproc | .cfi_startproc | ||||
pushq %r12 | pushq %r12 | ||||
.cfi_adjust_cfa_offset 8 | .cfi_adjust_cfa_offset 8 | ||||
@@ -723,10 +716,10 @@ sike_fprdc: | |||||
.cfi_adjust_cfa_offset -8 | .cfi_adjust_cfa_offset -8 | ||||
.byte 0xf3,0xc3 | .byte 0xf3,0xc3 | ||||
.cfi_endproc | .cfi_endproc | ||||
.globl sike_mpmul | |||||
.hidden sike_mpmul | |||||
.type sike_mpmul,@function | |||||
sike_mpmul: | |||||
.globl sike_mpmul_asm | |||||
.hidden sike_mpmul_asm | |||||
.type sike_mpmul_asm,@function | |||||
sike_mpmul_asm: | |||||
.cfi_startproc | .cfi_startproc | ||||
pushq %r12 | pushq %r12 | ||||
.cfi_adjust_cfa_offset 8 | .cfi_adjust_cfa_offset 8 | ||||
@@ -3,14 +3,17 @@ | |||||
* | * | ||||
* Abstract: portable modular arithmetic for P503 | * Abstract: portable modular arithmetic for P503 | ||||
*********************************************************************************************/ | *********************************************************************************************/ | ||||
#include "common/utils.h" | |||||
#include "utils.h" | #include "utils.h" | ||||
#include "fpx.h" | #include "fpx.h" | ||||
// Global constants | // Global constants | ||||
extern const struct params_t params; | extern const struct params_t params; | ||||
// Digit multiplication, digit * digit -> 2-digit result | |||||
static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c) | static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c) | ||||
{ // Digit multiplication, digit * digit -> 2-digit result | |||||
{ | |||||
crypto_word_t al, ah, bl, bh, temp; | crypto_word_t al, ah, bl, bh, temp; | ||||
crypto_word_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; | crypto_word_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; | ||||
crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4); | crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4); | ||||
@@ -43,10 +46,11 @@ static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_w | |||||
c[1] ^= (ahbh & mask_high) + carry; // C11 | c[1] ^= (ahbh & mask_high) + carry; // C11 | ||||
} | } | ||||
// Modular addition, c = a+b mod p434. | |||||
// Inputs: a, b in [0, 2*p434-1] | |||||
// Output: c in [0, 2*p434-1] | |||||
void sike_fpadd(const felm_t a, const felm_t b, felm_t c) | void sike_fpadd(const felm_t a, const felm_t b, felm_t c) | ||||
{ // Modular addition, c = a+b mod p434. | |||||
// Inputs: a, b in [0, 2*p434-1] | |||||
// Output: c in [0, 2*p434-1] | |||||
{ | |||||
unsigned int i, carry = 0; | unsigned int i, carry = 0; | ||||
crypto_word_t mask; | crypto_word_t mask; | ||||
@@ -84,12 +88,20 @@ void sike_fpsub(const felm_t a, const felm_t b, felm_t c) | |||||
} | } | ||||
} | } | ||||
// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. | |||||
void sike_mpmul_asm(const felm_t a, const felm_t b, dfelm_t c); | |||||
void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) | void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) | ||||
{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD. | |||||
{ | |||||
unsigned int i, j; | unsigned int i, j; | ||||
crypto_word_t t = 0, u = 0, v = 0, UV[2]; | crypto_word_t t = 0, u = 0, v = 0, UV[2]; | ||||
unsigned int carry = 0; | unsigned int carry = 0; | ||||
// TODO: it would be faster to test a precomputed bitmap of CPU features | |||||
if (get_cpu_caps()->bmi2 && get_cpu_caps()->adx) { | |||||
sike_mpmul_asm(a,b,c); | |||||
return; | |||||
} | |||||
for (i = 0; i < NWORDS_FIELD; i++) { | for (i = 0; i < NWORDS_FIELD; i++) { | ||||
for (j = 0; j <= i; j++) { | for (j = 0; j <= i; j++) { | ||||
MUL(a[j], b[i-j], UV+1, UV[0]); | MUL(a[j], b[i-j], UV+1, UV[0]); | ||||
@@ -118,11 +130,18 @@ void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c) | |||||
c[2*NWORDS_FIELD-1] = v; | c[2*NWORDS_FIELD-1] = v; | ||||
} | } | ||||
// Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. | |||||
// mc = ma*R^-1 mod p434x2, where R = 2^448. | |||||
// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. | |||||
// ma is assumed to be in Montgomery representation. | |||||
void sike_fprdc_asm(const felm_t ma, felm_t mc); | |||||
void sike_fprdc(const felm_t ma, felm_t mc) | void sike_fprdc(const felm_t ma, felm_t mc) | ||||
{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434. | |||||
// mc = ma*R^-1 mod p434x2, where R = 2^448. | |||||
// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1]. | |||||
// ma is assumed to be in Montgomery representation. | |||||
{ | |||||
if (get_cpu_caps()->bmi2 && get_cpu_caps()->adx) { | |||||
sike_fprdc_asm(ma, mc); | |||||
return; | |||||
} | |||||
unsigned int i, j, carry, count = ZERO_WORDS; | unsigned int i, j, carry, count = ZERO_WORDS; | ||||
crypto_word_t UV[2], t = 0, u = 0, v = 0; | crypto_word_t UV[2], t = 0, u = 0, v = 0; | ||||
@@ -0,0 +1,4 @@ | |||||
#include "fpx.h" | |||||
#include "utils.h" | |||||
// Prototype for a multiprecision multiplication routine taking two field
// elements (a, b) and a double-width result (c).
// NOTE(review): no definition of sike_mpmul_asm_X is visible in this change;
// the assembly file exports sike_mpmul_asm. Presumably a leftover - confirm
// whether this declaration (and this file) is still needed.
void sike_mpmul_asm_X(const felm_t a, const felm_t b, dfelm_t c); |
@@ -6,10 +6,12 @@ fn main() { | |||||
let dst = Config::new("../../../") | let dst = Config::new("../../../") | ||||
.profile("Release") | .profile("Release") | ||||
.very_verbose(true) | .very_verbose(true) | ||||
.build(); | |||||
.build(); | |||||
println!("cargo:rustc-link-search=native={}/lib", dst.display()); | println!("cargo:rustc-link-search=native={}/lib", dst.display()); | ||||
println!("cargo:rustc-link-lib=static=pqc_s"); | println!("cargo:rustc-link-lib=static=pqc_s"); | ||||
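// The GetX86Info symbol is left undefined in pqc_s (presumably because a static archive does not bundle the cpu_features objects it depends on - TODO confirm), hence cpu_features is linked explicitly below. | |||||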
println!("cargo:rustc-link-lib=static=cpu_features"); | |||||
println!("cargo:rerun-if-changed=../../../capi/*,../../../kem/*,../../../sign/*,../../../../public/pqc/pqc.h"); | println!("cargo:rerun-if-changed=../../../capi/*,../../../kem/*,../../../sign/*,../../../../public/pqc/pqc.h"); | ||||
// The bindgen::Builder is the main entry point | // The bindgen::Builder is the main entry point | ||||