Barrett-based reduction

3 years ago · 54d54ce47b
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -555,6 +555,14 @@ target_link_libraries(
  pqclean_dilithium5_clean
 )

 # Install both library targets. The permission set deliberately omits
 # WORLD_WRITE: an installed library that any local user can overwrite
 # lets that user replace the code every consumer of the library runs.
 install(TARGETS pqclean pqclean_s
  PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ GROUP_WRITE WORLD_READ
  LIBRARY DESTINATION lib
  ARCHIVE DESTINATION lib)
 # Install the files listed in QRS_PUBLIC_INC under include/pqclean.
 install(FILES
  ${QRS_PUBLIC_INC}
  DESTINATION include/pqclean)

 # TODO: this requires changes to testvectors.c
 # add_executable(
 #   test
--- a/src/sign/dilithium/dilithium2/clean/rounding.c
+++ b/src/sign/dilithium/dilithium2/clean/rounding.c
@@ -38,66 +38,126 @@ int32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(int32_t *a0, int32_t a)  {
 *
 * Returns a1.
 **************************************************/
 int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a) {
    int32_t a1 = 0;
    uint64_t r;

    int32_t r0, r1;
 int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose_ORG(int32_t *a0, int32_t a) {
    /* TODO:
        a % Q is skipped, as it seems a<Q always. In case this
        needs to be done, then we can use the fact that Q is a
        Generalized Mersenne Prime, so modular reduction is fast
        (see work by Jerome Solinas and Crandall '92 algo).
    */

    assert(a>0); assert(a<Q);

    // mod ALPHA
    static const uint32_t u = 360800;
    r = ((uint64_t)a)*u;
    r >>= 36;
    r *= 2 * GAMMA2;
    r = a - r;

    if (r>(2*GAMMA2)) {
        r -= 2*GAMMA2;
    // Use Barrett reduction to calculate r0 = r % A. The
    // code estimates the quotient as:
    //     floor(a/A) ~= floor((a * R) / 2^M)   (may be one too small)
    //       where,
    //          M is chosen so that 2^M >= A^2
    //          R = floor(2^M / A)
    static const uint32_t M = 36;
    // Precomputed reciprocal R = floor(2^36 / 190464)
    static const uint32_t R = 360800;
    // As per spec ALPHA, A = 2*GAMMA2 = (Q-1)/88 * 2 (Dilithium2)
    static const uint32_t A = 2*GAMMA2;

    // a0
    int32_t r;
    int32_t v,w,z;
    // Barrett reduction:
    //   a0' = a mod A = a - A*floor((a*r) / 2^M)
    r = (int32_t)((((uint64_t)a)*R) >> M);
    r = a - r*A;

    v = ((A-r)>>31) & 1;
    w = ((GAMMA2 - r)>>31) & 1;
    z = (((A + GAMMA2) -r) >> 31) & 1;
 //    printf("%d %d %d\n", v,w,z);
    *a0 = r - (((!z)&(v|w))*A) - (z)*2*A;

    /*
    // REDC
    if (r>(int32_t)A) {
        r -= A;
    }

    r1 = ((int32_t)r)*2*GAMMA2;

    // centrize
    if (r > GAMMA2) {
        *a0 = (int32_t)r - 2*GAMMA2;
    if (r > (int32_t)GAMMA2) {
        *a0 = (int32_t)r - A;
    } else {
        *a0 = r;
    }
    */

    // a1
    uint64_t a2 = a - *a0;

    // OLD
    a1  = (a + 127) >> 7;
    a1  = (a1 * 11275 + (1 << 23)) >> 24;
    a1 ^= ((43 - a1) >> 31) & a1;

    v = ((int32_t)a2 - Q + 1);
    //a2 = (!v);
    *a0 -= !v;
    a2 = (!!v)*a2;
    // CASE: r-r0 = q-1 => r1=0, r0 = r0-1
    uint64_t a2 = (uint64_t)a - *a0;
    #if 0
    if (a2 == (Q-1)) {
        a2 = 0;
        *a0--;
        *a0 = *a0-1;
    }

    // divide (r-r0)/alpha
    // int32_t a2 = ((uint64_t)a-*a0)/(2*GAMMA2);
    if ( (a2 >= (2*GAMMA2))) {
        a2 = (a2*u) >> 36;
        // a2 is divisible by ALPHA=(2*GAMMA2) and hence
    #endif

    // divide (r-r0)/A
    // int32_t a2 = ((uint64_t)a-*a0)/(A);
    v = ((int32_t)a2-A) >> 31;
    a2 = (!v)*(((a2*R) >> M) + 1) + v*a2;
    /*
    if ( (a2 >= (A))) {
        a2 = (a2*R) >> M;
        // a2 is divisible by ALPHA=(A) and hence
        // it will always be off by one.
        a2++;
    }
    */
    return a2;
 }



    //if (!a1) a2 = a1;

    //*a0  = a - a1 * 2 * GAMMA2;
    //*a0 -= (((Q - 1) / 2 - *a0) >> 31) & Q;
    if (a1 != (int32_t)a2)
        printf("OZAPTF: (A1=%d, A2=%d, A=%d R=%d)\n",
            a1, (int32_t)a2, a, (a-(*a0)));
 //    printf("OZAPTF: %d %d %d\n", a, *a0, (a-*a0));
 /*************************************************
 * Name:        PQCLEAN_DILITHIUM2_CLEAN_decompose
 *
 * Description: For an element a, compute a1 and a0 such that
 *              a = a1*A + a0 with A = 2*GAMMA2 and a0 centered
 *              around zero. When a - a0 equals Q-1, a1 is set to 0
 *              and a0 is decremented by one instead. The division by
 *              A uses a Barrett-style multiply-and-shift, and every
 *              selection is made with 0/1 flags instead of if/else.
 *              Assumes 0 <= a < Q; the input is neither reduced nor
 *              checked here.
 *
 * Arguments:   - int32_t *a0: pointer to output element a0
 *              - int32_t a: input element
 *
 * Returns a1.
 **************************************************/
 int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a) {
    /* TODO:
        a % Q is skipped, as it seems a<Q always. In case this
        needs to be done, then we can use the fact that Q is a
        Generalized Mersenne Prime, so modular reduction is fast
        (see work by Jerome Solinas and Crandall '92 algo).
    */

    // Barrett reduction parameters. The quotient is estimated as
    //     floor(a/A) ~= floor((a * R) / 2^M)
    // where M is chosen so that 2^M >= A^2 and R = floor(2^M / A).
    static const uint32_t M = 36;
    // Precomputed reciprocal R = floor(2^36 / 190464)
    static const uint32_t R = 360800;
    // As per spec ALPHA, A = 2*GAMMA2 = (Q-1)/88 * 2 (Dilithium2)
    static const uint32_t A = 2 * GAMMA2;
    // GAMMA2 as an unsigned value, so that every comparison below is
    // done in uint32_t and never right-shifts a negative signed value.
    const uint32_t G = (uint32_t)GAMMA2;

    // Quotient estimate. It can be one below floor(a/A), so the
    // remainder estimate may come out as A or slightly above it.
    const uint32_t quot = (uint32_t)((((uint64_t)a) * R) >> M);
    const uint32_t rem = (uint32_t)a - quot * A;

    // 0/1 comparison flags, read from bit 31 of an unsigned difference
    // (the difference wraps around exactly when rem is the larger one).
    const uint32_t gt_a = (A - rem) >> 31;         // rem > A
    const uint32_t gt_g = (G - rem) >> 31;         // rem > GAMMA2
    const uint32_t gt_ag = ((A + G) - rem) >> 31;  // rem > A + GAMMA2

    // Center the remainder: subtract 2*A when it is above A + GAMMA2,
    // otherwise subtract A when it is above GAMMA2, otherwise nothing.
    const uint32_t sub_a = (gt_ag ^ 1u) & (gt_a | gt_g);
    int32_t r0 = (int32_t)(rem - sub_a * A - gt_ag * 2u * A);

    // a1: at this point a - a0 is a multiple of A.
    const int32_t diff = a - r0;
    // CASE: a - a0 == Q - 1  =>  a1 = 0, a0 = a0 - 1
    const int32_t wrap = (diff == Q - 1);
    r0 -= wrap;
    const uint64_t mult = (uint64_t)(wrap ^ 1) * (uint64_t)diff;

    // Divide by A. For a non-zero multiple of A the multiply-and-shift
    // estimate is one too small, hence the +1. A value below A (zero for
    // in-range input) is passed through unchanged.
    const uint32_t lt_a = ((uint32_t)mult - A) >> 31;
    const uint64_t a1 = (uint64_t)(lt_a ^ 1u) * (((mult * R) >> M) + 1)
                        + (uint64_t)lt_a * mult;

    *a0 = r0;
    return (int32_t)a1;
 }