浏览代码

SIKE: enable optimized version

sike
Henry Case 3 年前
父节点
当前提交
c18ca419a8
共有 9 个文件被更改,包括 68 次插入和 35 次删除
  1. +3
    -0
      .gitmodules
  2. +1
    -0
      3rd/cpu_features
  3. +4
    -0
      CMakeLists.txt
  4. +7
    -0
      src/capi/pqapi.c
  5. +1
    -1
      src/capi/schemes.h
  6. +3
    -2
      src/kem/sike/CMakeLists.txt
  7. +16
    -23
      src/kem/sike/p434/fp-x86_64.S
  8. +29
    -9
      src/kem/sike/p434/fp_generic.c
  9. +4
    -0
      src/kem/sike/p434/fp_glue.c

+ 3
- 0
.gitmodules 查看文件

@@ -7,3 +7,6 @@
[submodule "3rd/gbench"]
path = 3rd/gbench
url = https://github.com/henrydcase/benchmark.git
[submodule "3rd/cpu_features"]
path = 3rd/cpu_features
url = https://github.com/google/cpu_features.git

+ 1
- 0
3rd/cpu_features

@@ -0,0 +1 @@
Subproject commit 3e8243b7d9951c078259c3186c039a6e8f036055

+ 4
- 0
CMakeLists.txt 查看文件

@@ -34,6 +34,7 @@ else()
endif()

add_subdirectory(3rd/gtest)
add_subdirectory(3rd/cpu_features)

# Arch settings

@@ -82,6 +83,7 @@ include_directories(
public
src/common/
src
3rd/cpu_features/include
)

set_property(GLOBAL PROPERTY obj_libs "")
@@ -243,12 +245,14 @@ get_property(OBJ_LIBS GLOBAL PROPERTY obj_libs)
target_link_libraries(
pqc
common
cpu_features
${OBJ_LIBS}
)

target_link_libraries(
pqc_s
common
cpu_features
${OBJ_LIBS}
)



+ 7
- 0
src/capi/pqapi.c 查看文件

@@ -1,6 +1,7 @@
#include <stdint.h>
#include <stdbool.h>
#include <pqc/pqc.h>
#include <cpuinfo_x86.h>

#include "schemes.h"

@@ -126,3 +127,9 @@ bool pqc_sig_verify(const params_t *p,
const uint8_t *pk) {
return !((sig_params_t *)p)->verify(sig, siglen, m, mlen, pk);
}

// CPU feature flags for the running machine. Defined with external linkage
// so that other translation units can reach it through an `extern`
// declaration; it is filled in by static_initialization() below.
X86Features CPU_CAPS;

void static_initialization(void);

// Load-time hook: the `constructor` attribute asks the toolchain to invoke
// this function automatically before main() runs (or when the library is
// loaded), so CPU_CAPS is populated without an explicit init call.
__attribute__((constructor))
void static_initialization(void)
{
    // Query cpu_features once and keep only the feature bit-field.
    CPU_CAPS = GetX86Info().features;
}

+ 1
- 1
src/capi/schemes.h 查看文件

@@ -115,4 +115,4 @@
#include "kem/hqc/hqc-rmrs-128/avx2/api.h"
#include "kem/hqc/hqc-rmrs-192/avx2/api.h"
#include "kem/hqc/hqc-rmrs-256/avx2/api.h"
#include "kem/sike/includes/sike/sike.h"
#include "kem/sike/includes/sike/sike.h"

+ 3
- 2
src/kem/sike/CMakeLists.txt 查看文件

@@ -1,12 +1,13 @@
set(
SRC_CLEAN_SIKE_P434
p434/fpx.c
p434/isogeny.c
p434/fp_generic.c
p434/fp_glue.c
p434/fp-x86_64.S
p434/isogeny.c
p434/params.c
p434/sike.c)

define_kem_alg(
sike_p434_clean
PQC_SIKEP434_CLEAN "${SRC_CLEAN_SIKE_P434}" "${CMAKE_CURRENT_SOURCE_DIR}")


+ 16
- 23
src/kem/sike/p434/fp-x86_64.S 查看文件

@@ -15,10 +15,10 @@
.quad 0x6CFC5FD681C52056
.quad 0x0002341F27177344

.globl sike_fpadd
.hidden sike_fpadd
.type sike_fpadd,@function
sike_fpadd:
.globl sike_fpadd_asm
.hidden sike_fpadd_asm
.type sike_fpadd_asm,@function
sike_fpadd_asm:
.cfi_startproc
pushq %r12
.cfi_adjust_cfa_offset 8
@@ -107,14 +107,7 @@ sike_fpadd:
.hidden sike_cswap_asm
.type sike_cswap_asm,@function
sike_cswap_asm:


movq %rdx,%xmm3





pshufd $68,%xmm3,%xmm3

movdqu 0(%rdi),%xmm0
@@ -258,10 +251,10 @@ sike_cswap_asm:
movdqu %xmm1,208(%rsi)

.byte 0xf3,0xc3
.globl sike_fpsub
.hidden sike_fpsub
.type sike_fpsub,@function
sike_fpsub:
.globl sike_fpsub_asm
.hidden sike_fpsub_asm
.type sike_fpsub_asm,@function
sike_fpsub_asm:
.cfi_startproc
pushq %r12
.cfi_adjust_cfa_offset 8
@@ -508,10 +501,10 @@ sike_mpdblsubx2_asm:
.byte 0xf3,0xc3
.cfi_endproc

.globl sike_fprdc
.hidden sike_fprdc
.type sike_fprdc,@function
sike_fprdc:
.globl sike_fprdc_asm
.hidden sike_fprdc_asm
.type sike_fprdc_asm,@function
sike_fprdc_asm:
.cfi_startproc
pushq %r12
.cfi_adjust_cfa_offset 8
@@ -723,10 +716,10 @@ sike_fprdc:
.cfi_adjust_cfa_offset -8
.byte 0xf3,0xc3
.cfi_endproc
.globl sike_mpmul
.hidden sike_mpmul
.type sike_mpmul,@function
sike_mpmul:
.globl sike_mpmul_asm
.hidden sike_mpmul_asm
.type sike_mpmul_asm,@function
sike_mpmul_asm:
.cfi_startproc
pushq %r12
.cfi_adjust_cfa_offset 8


+ 29
- 9
src/kem/sike/p434/fp_generic.c 查看文件

@@ -5,12 +5,16 @@
*********************************************************************************************/
#include "utils.h"
#include "fpx.h"
#include <cpuinfo_x86.h>

extern X86Features CPU_CAPS;

// Global constants
extern const struct params_t params;

// Digit multiplication, digit * digit -> 2-digit result
static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_word_t* c)
{ // Digit multiplication, digit * digit -> 2-digit result
{
crypto_word_t al, ah, bl, bh, temp;
crypto_word_t albl, albh, ahbl, ahbh, res1, res2, res3, carry;
crypto_word_t mask_low = (crypto_word_t)(-1) >> (sizeof(crypto_word_t)*4);
@@ -43,10 +47,11 @@ static void digit_x_digit(const crypto_word_t a, const crypto_word_t b, crypto_w
c[1] ^= (ahbh & mask_high) + carry; // C11
}

// Modular addition, c = a+b mod p434.
// Inputs: a, b in [0, 2*p434-1]
// Output: c in [0, 2*p434-1]
void sike_fpadd(const felm_t a, const felm_t b, felm_t c)
{ // Modular addition, c = a+b mod p434.
// Inputs: a, b in [0, 2*p434-1]
// Output: c in [0, 2*p434-1]
{
unsigned int i, carry = 0;
crypto_word_t mask;

@@ -84,12 +89,20 @@ void sike_fpsub(const felm_t a, const felm_t b, felm_t c)
}
}

// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD.
void sike_mpmul_asm(const felm_t a, const felm_t b, dfelm_t c);
void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c)
{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = NWORDS_FIELD.
{
unsigned int i, j;
crypto_word_t t = 0, u = 0, v = 0, UV[2];
unsigned int carry = 0;

// TODO: the assembly path actually needs both BMI2 and ADOX; cpu_features
// needs to be updated so that both can be checked here.
if (CPU_CAPS.bmi2) {
sike_mpmul_asm(a,b,c);
return;
}

for (i = 0; i < NWORDS_FIELD; i++) {
for (j = 0; j <= i; j++) {
MUL(a[j], b[i-j], UV+1, UV[0]);
@@ -118,11 +131,18 @@ void sike_mpmul(const felm_t a, const felm_t b, dfelm_t c)
c[2*NWORDS_FIELD-1] = v;
}

// Efficient Montgomery reduction using comba and exploiting the special form of the prime p434.
// mc = ma*R^-1 mod p434x2, where R = 2^448.
// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1].
// ma is assumed to be in Montgomery representation.
void sike_fprdc_asm(const felm_t ma, felm_t mc);
void sike_fprdc(const felm_t ma, felm_t mc)
{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p434.
// mc = ma*R^-1 mod p434x2, where R = 2^448.
// If ma < 2^448*p434, the output mc is in the range [0, 2*p434-1].
// ma is assumed to be in Montgomery representation.
{
if (CPU_CAPS.bmi2) {
sike_fprdc_asm(ma, mc);
return;
}

unsigned int i, j, carry, count = ZERO_WORDS;
crypto_word_t UV[2], t = 0, u = 0, v = 0;



+ 4
- 0
src/kem/sike/p434/fp_glue.c 查看文件

@@ -0,0 +1,4 @@
#include "fpx.h"
#include "utils.h"

void sike_mpmul_asm_X(const felm_t a, const felm_t b, dfelm_t c);

正在加载...
取消
保存