From 3634311467a073737bb0192022e65f15ed621a49 Mon Sep 17 00:00:00 2001 From: rcxpony Date: Sat, 15 Mar 2025 05:23:11 +0500 Subject: [PATCH] optimized --- libs/edsign.cu | 12 ++++++------ libs/edsign.cuh | 10 +++++----- libs/f25519.cu | 50 +++++++++++++++++++++---------------------------- libs/sha512.cu | 12 ++++-------- sources/main.cu | 5 ++--- 5 files changed, 38 insertions(+), 51 deletions(-) diff --git a/libs/edsign.cu b/libs/edsign.cu index 44078a5..3493a64 100644 --- a/libs/edsign.cu +++ b/libs/edsign.cu @@ -1,30 +1,30 @@ #include #include #include -__device__ void expand_key(unsigned char* expanded, const unsigned char* secret) { +__device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret) { struct sha512_state s; memcpy(&s, &sha512_initial_state, sizeof(s)); sha512_final(&s, secret); sha512_get(&s, expanded); ed25519_prepare(expanded); } -__device__ void pp(unsigned char* packed, const struct ed25519_pt* p) { +__device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p) { unsigned char x[32], y[32]; ed25519_unproject(x, y, p); ed25519_pack(packed, x, y); } -__device__ void sm_pack(unsigned char* r, const unsigned char* k) { +__device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k) { struct ed25519_pt p; ed25519_smult(&p, &ed25519_base, k); pp(r, &p); } -__device__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret) { +__device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret) { unsigned char expanded[64]; expand_key(expanded, secret); sm_pack(pub, expanded); } -__device__ void compact_wipe(void* __restrict__ data) { - volatile unsigned char* p = (volatile unsigned char*)data; +__device__ __forceinline__ void compact_wipe(void* __restrict__ data) { + unsigned char* p = (unsigned char*)data; unsigned long i = 0; #pragma unroll for (; i + 3 < 32; i += 4) { diff --git a/libs/edsign.cuh b/libs/edsign.cuh index 2ed17c2..dfaab5a 100644 --- a/libs/edsign.cuh +++ b/libs/edsign.cuh @@ -1,9 +1,9 @@ #ifndef __EDSIGN_CUH #define __EDSIGN_CUH -__device__ void expand_key(unsigned char* expanded, const unsigned char* secret); -__device__ void pp(unsigned char* packed, const struct ed25519_pt* p); -__device__ void sm_pack(unsigned char* r, const unsigned char* k); -__device__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret); -__device__ void compact_wipe(void* __restrict__ data); +__device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret); +__device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p); +__device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k); +__device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret); +__device__ __forceinline__ void compact_wipe(void* __restrict__ data); __device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]); #endif \ No newline at end of file diff --git a/libs/f25519.cu b/libs/f25519.cu index eb44d1a..43563d7 100644 --- a/libs/f25519.cu +++ b/libs/f25519.cu @@ -1,6 +1,5 @@ #include #include -#include __device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) { const uint4* src = reinterpret_cast(a); uint4* dst = reinterpret_cast(x); @@ -24,11 +23,12 @@ __device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned ch d[0] = res0; d[1] = res1; } + __device__ void f25519_normalize(unsigned char* __restrict__ x) { __align__(32) unsigned char minusp[32]; unsigned c = (x[31] >> 7) * 19; x[31] &= 127; -#pragma unroll 32 +#pragma unroll for (int i = 0; i < 32; i++) { c += x[i]; x[i] = (unsigned char)c; @@ -36,7 +36,7 @@ __device__ void f25519_normalize(unsigned char* __restrict__ x) { } c = 19; #pragma unroll - for (int i = 0; i + 1 < 32; i++) { + for (int i = 0; i < 31; i++) { c += x[i]; minusp[i] = (unsigned char)c; c >>= 8; @@ -47,89 +47,81 @@ __device__ void f25519_normalize(unsigned char* __restrict__ x) { } __device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { unsigned c = 0; -#pragma unroll 32 +#pragma unroll for (int i = 0; i < 32; i++) { c = (c >> 8) + ((unsigned)a[i]) + ((unsigned)b[i]); r[i] = (unsigned char)c; } r[31] &= 127; c = (c >> 7) * 19; -#pragma unroll 32 +#pragma unroll for (int i = 0; i < 32; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } - __device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { unsigned c = 218; - int i = 0; #pragma unroll - for (i = 0; i + 1 < 32; i++) { + for (int i = 0; i < 31; i++) { c += 65280 + ((unsigned)a[i]) - ((unsigned)b[i]); r[i] = (unsigned char)c; c >>= 8; } - c += ((unsigned)a[i]) - ((unsigned)b[i]); - r[i] = (unsigned char)(c & 127); + c += ((unsigned)a[31]) - ((unsigned)b[31]); + r[31] = (unsigned char)(c & 127); c = (c >> 7) * 19; -#pragma unroll 32 - for (i = 0; i < 32; i++) { +#pragma unroll + for (int i = 0; i < 32; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } - __device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) { unsigned c = 218; - int i = 0; #pragma unroll - for (i = 0; i + 1 < 32; i++) { + for (int i = 0; i < 31; i++) { c += 65280 - ((unsigned)a[i]); r[i] = (unsigned char)c; c >>= 8; } - c -= ((unsigned)a[i]); - r[i] = (unsigned char)(c & 127); + c -= ((unsigned)a[31]); + r[31] = (unsigned char)(c & 127); c = (c >> 7) * 19; -#pragma unroll 32 - for (i = 0; i < 32; i++) { +#pragma unroll + for (int i = 0; i < 32; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } - __device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { unsigned c = 0; -#pragma unroll 32 +#pragma unroll for (int i = 0; i < 32; i++) { c >>= 8; - for (int j = 0; j <= i; j++) { + for (int j = 0; j <= i; j++) c += ((unsigned)a[j]) * ((unsigned)b[i - j]); - } - for (int j = i + 1; j < 32; j++) { + for (int j = i + 1; j < 32; j++) c += ((unsigned)a[j]) * ((unsigned)b[32 + i - j]) * 38; - } r[i] = (unsigned char)c; } r[31] &= 127; c = (c >> 7) * 19; -#pragma unroll 32 +#pragma unroll for (int i = 0; i < 32; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } - __device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) { __align__(32) unsigned char s[32]; f25519_mul__distinct(s, x, x); f25519_mul__distinct(r, s, x); -#pragma unroll 248 +#pragma unroll for (int i = 0; i < 248; i++) { f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, x); @@ -142,4 +134,4 @@ __device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsign f25519_mul__distinct(r, s, x); f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, x); -} +} \ No newline at end of file diff --git a/libs/sha512.cu b/libs/sha512.cu index 881d17f..b47954a 100644 --- a/libs/sha512.cu +++ b/libs/sha512.cu @@ -28,14 +28,10 @@ __device__ __constant__ unsigned long round_k[80] = { 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, }; __device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) { - return (static_cast(x[0]) << 56) | - (static_cast(x[1]) << 48) | - (static_cast(x[2]) << 40) | - (static_cast(x[3]) << 32) | - (static_cast(x[4]) << 24) | - (static_cast(x[5]) << 16) | - (static_cast(x[6]) << 8) | - (static_cast(x[7])); + return (static_cast(x[0]) << 56) | (static_cast(x[1]) << 48) | + (static_cast(x[2]) << 40) | (static_cast(x[3]) << 32) | + (static_cast(x[4]) << 24) | (static_cast(x[5]) << 16) | + (static_cast(x[6]) << 8) | (static_cast(x[7])); } __device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v) { #pragma unroll 8 diff --git a/sources/main.cu b/sources/main.cu index e3c30cb..4b87f08 100644 --- a/sources/main.cu +++ b/sources/main.cu @@ -18,8 +18,7 @@ __device__ int parameters(const char* arg) noexcept { extract_substring(arg, substr_start, sub_arg, 256); if (cstring_find(arg, "--altitude") != -1 || cstring_find(arg, "-a") != -1) { unsigned tmp_high; - if (cstring_to_ull(sub_arg, &tmp_high) != 0) - return 1; + if (cstring_to_ull(sub_arg, &tmp_high) != 0) return 1; d_high = tmp_high; } return 0; @@ -91,7 +90,7 @@ __global__ void KeyGen(curandState* randStates) { } } int main(int argc, char* argv[]) { - const int thPerBlock = 128; + const int thPerBlock = 256; int* d_result, mBpSM, h_high; char** d_argv; cudaDeviceProp prop;