optimized

This commit is contained in:
rcxpony 2025-03-15 05:23:11 +05:00
parent 68472fc649
commit 3634311467
5 changed files with 38 additions and 51 deletions

View File

@ -1,30 +1,30 @@
#include <edsign.cuh> #include <edsign.cuh>
#include <ed25519.cuh> #include <ed25519.cuh>
#include <sha512.cuh> #include <sha512.cuh>
__device__ void expand_key(unsigned char* expanded, const unsigned char* secret) { __device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret) {
struct sha512_state s; struct sha512_state s;
memcpy(&s, &sha512_initial_state, sizeof(s)); memcpy(&s, &sha512_initial_state, sizeof(s));
sha512_final(&s, secret); sha512_final(&s, secret);
sha512_get(&s, expanded); sha512_get(&s, expanded);
ed25519_prepare(expanded); ed25519_prepare(expanded);
} }
__device__ void pp(unsigned char* packed, const struct ed25519_pt* p) { __device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p) {
unsigned char x[32], y[32]; unsigned char x[32], y[32];
ed25519_unproject(x, y, p); ed25519_unproject(x, y, p);
ed25519_pack(packed, x, y); ed25519_pack(packed, x, y);
} }
__device__ void sm_pack(unsigned char* r, const unsigned char* k) { __device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k) {
struct ed25519_pt p; struct ed25519_pt p;
ed25519_smult(&p, &ed25519_base, k); ed25519_smult(&p, &ed25519_base, k);
pp(r, &p); pp(r, &p);
} }
__device__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret) { __device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret) {
unsigned char expanded[64]; unsigned char expanded[64];
expand_key(expanded, secret); expand_key(expanded, secret);
sm_pack(pub, expanded); sm_pack(pub, expanded);
} }
__device__ void compact_wipe(void* __restrict__ data) { __device__ __forceinline__ void compact_wipe(void* __restrict__ data) {
volatile unsigned char* p = (volatile unsigned char*)data; unsigned char* p = (unsigned char*)data;
unsigned long i = 0; unsigned long i = 0;
#pragma unroll #pragma unroll
for (; i + 3 < 32; i += 4) { for (; i + 3 < 32; i += 4) {

View File

@ -1,9 +1,9 @@
#ifndef __EDSIGN_CUH #ifndef __EDSIGN_CUH
#define __EDSIGN_CUH #define __EDSIGN_CUH
__device__ void expand_key(unsigned char* expanded, const unsigned char* secret); __device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret);
__device__ void pp(unsigned char* packed, const struct ed25519_pt* p); __device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p);
__device__ void sm_pack(unsigned char* r, const unsigned char* k); __device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k);
__device__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret); __device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret);
__device__ void compact_wipe(void* __restrict__ data); __device__ __forceinline__ void compact_wipe(void* __restrict__ data);
__device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]); __device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]);
#endif #endif

View File

@ -1,6 +1,5 @@
#include <f25519.cuh> #include <f25519.cuh>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <stdint.h>
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) { __device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
const uint4* src = reinterpret_cast<const uint4*>(a); const uint4* src = reinterpret_cast<const uint4*>(a);
uint4* dst = reinterpret_cast<uint4*>(x); uint4* dst = reinterpret_cast<uint4*>(x);
@ -24,11 +23,12 @@ __device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned ch
d[0] = res0; d[0] = res0;
d[1] = res1; d[1] = res1;
} }
__device__ void f25519_normalize(unsigned char* __restrict__ x) { __device__ void f25519_normalize(unsigned char* __restrict__ x) {
__align__(32) unsigned char minusp[32]; __align__(32) unsigned char minusp[32];
unsigned c = (x[31] >> 7) * 19; unsigned c = (x[31] >> 7) * 19;
x[31] &= 127; x[31] &= 127;
#pragma unroll 32 #pragma unroll
for (int i = 0; i < 32; i++) { for (int i = 0; i < 32; i++) {
c += x[i]; c += x[i];
x[i] = (unsigned char)c; x[i] = (unsigned char)c;
@ -36,7 +36,7 @@ __device__ void f25519_normalize(unsigned char* __restrict__ x) {
} }
c = 19; c = 19;
#pragma unroll #pragma unroll
for (int i = 0; i + 1 < 32; i++) { for (int i = 0; i < 31; i++) {
c += x[i]; c += x[i];
minusp[i] = (unsigned char)c; minusp[i] = (unsigned char)c;
c >>= 8; c >>= 8;
@ -47,89 +47,81 @@ __device__ void f25519_normalize(unsigned char* __restrict__ x) {
} }
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { __device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 0; unsigned c = 0;
#pragma unroll 32 #pragma unroll
for (int i = 0; i < 32; i++) { for (int i = 0; i < 32; i++) {
c = (c >> 8) + ((unsigned)a[i]) + ((unsigned)b[i]); c = (c >> 8) + ((unsigned)a[i]) + ((unsigned)b[i]);
r[i] = (unsigned char)c; r[i] = (unsigned char)c;
} }
r[31] &= 127; r[31] &= 127;
c = (c >> 7) * 19; c = (c >> 7) * 19;
#pragma unroll 32 #pragma unroll
for (int i = 0; i < 32; i++) { for (int i = 0; i < 32; i++) {
c += r[i]; c += r[i];
r[i] = (unsigned char)c; r[i] = (unsigned char)c;
c >>= 8; c >>= 8;
} }
} }
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { __device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 218; unsigned c = 218;
int i = 0;
#pragma unroll #pragma unroll
for (i = 0; i + 1 < 32; i++) { for (int i = 0; i < 31; i++) {
c += 65280 + ((unsigned)a[i]) - ((unsigned)b[i]); c += 65280 + ((unsigned)a[i]) - ((unsigned)b[i]);
r[i] = (unsigned char)c; r[i] = (unsigned char)c;
c >>= 8; c >>= 8;
} }
c += ((unsigned)a[i]) - ((unsigned)b[i]); c += ((unsigned)a[31]) - ((unsigned)b[31]);
r[i] = (unsigned char)(c & 127); r[31] = (unsigned char)(c & 127);
c = (c >> 7) * 19; c = (c >> 7) * 19;
#pragma unroll 32 #pragma unroll
for (i = 0; i < 32; i++) { for (int i = 0; i < 32; i++) {
c += r[i]; c += r[i];
r[i] = (unsigned char)c; r[i] = (unsigned char)c;
c >>= 8; c >>= 8;
} }
} }
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) { __device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
unsigned c = 218; unsigned c = 218;
int i = 0;
#pragma unroll #pragma unroll
for (i = 0; i + 1 < 32; i++) { for (int i = 0; i < 31; i++) {
c += 65280 - ((unsigned)a[i]); c += 65280 - ((unsigned)a[i]);
r[i] = (unsigned char)c; r[i] = (unsigned char)c;
c >>= 8; c >>= 8;
} }
c -= ((unsigned)a[i]); c -= ((unsigned)a[31]);
r[i] = (unsigned char)(c & 127); r[31] = (unsigned char)(c & 127);
c = (c >> 7) * 19; c = (c >> 7) * 19;
#pragma unroll 32 #pragma unroll
for (i = 0; i < 32; i++) { for (int i = 0; i < 32; i++) {
c += r[i]; c += r[i];
r[i] = (unsigned char)c; r[i] = (unsigned char)c;
c >>= 8; c >>= 8;
} }
} }
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { __device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 0; unsigned c = 0;
#pragma unroll 32 #pragma unroll
for (int i = 0; i < 32; i++) { for (int i = 0; i < 32; i++) {
c >>= 8; c >>= 8;
for (int j = 0; j <= i; j++) { for (int j = 0; j <= i; j++)
c += ((unsigned)a[j]) * ((unsigned)b[i - j]); c += ((unsigned)a[j]) * ((unsigned)b[i - j]);
} for (int j = i + 1; j < 32; j++)
for (int j = i + 1; j < 32; j++) {
c += ((unsigned)a[j]) * ((unsigned)b[32 + i - j]) * 38; c += ((unsigned)a[j]) * ((unsigned)b[32 + i - j]) * 38;
}
r[i] = (unsigned char)c; r[i] = (unsigned char)c;
} }
r[31] &= 127; r[31] &= 127;
c = (c >> 7) * 19; c = (c >> 7) * 19;
#pragma unroll 32 #pragma unroll
for (int i = 0; i < 32; i++) { for (int i = 0; i < 32; i++) {
c += r[i]; c += r[i];
r[i] = (unsigned char)c; r[i] = (unsigned char)c;
c >>= 8; c >>= 8;
} }
} }
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) { __device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
__align__(32) unsigned char s[32]; __align__(32) unsigned char s[32];
f25519_mul__distinct(s, x, x); f25519_mul__distinct(s, x, x);
f25519_mul__distinct(r, s, x); f25519_mul__distinct(r, s, x);
#pragma unroll 248 #pragma unroll
for (int i = 0; i < 248; i++) { for (int i = 0; i < 248; i++) {
f25519_mul__distinct(s, r, r); f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x); f25519_mul__distinct(r, s, x);

View File

@ -28,14 +28,10 @@ __device__ __constant__ unsigned long round_k[80] = {
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
}; };
__device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) { __device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) {
return (static_cast<unsigned long>(x[0]) << 56) | return (static_cast<unsigned long>(x[0]) << 56) | (static_cast<unsigned long>(x[1]) << 48) |
(static_cast<unsigned long>(x[1]) << 48) | (static_cast<unsigned long>(x[2]) << 40) | (static_cast<unsigned long>(x[3]) << 32) |
(static_cast<unsigned long>(x[2]) << 40) | (static_cast<unsigned long>(x[4]) << 24) | (static_cast<unsigned long>(x[5]) << 16) |
(static_cast<unsigned long>(x[3]) << 32) | (static_cast<unsigned long>(x[6]) << 8) | (static_cast<unsigned long>(x[7]));
(static_cast<unsigned long>(x[4]) << 24) |
(static_cast<unsigned long>(x[5]) << 16) |
(static_cast<unsigned long>(x[6]) << 8) |
(static_cast<unsigned long>(x[7]));
} }
__device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v) { __device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v) {
#pragma unroll 8 #pragma unroll 8

View File

@ -18,8 +18,7 @@ __device__ int parameters(const char* arg) noexcept {
extract_substring(arg, substr_start, sub_arg, 256); extract_substring(arg, substr_start, sub_arg, 256);
if (cstring_find(arg, "--altitude") != -1 || cstring_find(arg, "-a") != -1) { if (cstring_find(arg, "--altitude") != -1 || cstring_find(arg, "-a") != -1) {
unsigned tmp_high; unsigned tmp_high;
if (cstring_to_ull(sub_arg, &tmp_high) != 0) if (cstring_to_ull(sub_arg, &tmp_high) != 0) return 1;
return 1;
d_high = tmp_high; d_high = tmp_high;
} }
return 0; return 0;
@ -91,7 +90,7 @@ __global__ void KeyGen(curandState* randStates) {
} }
} }
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
const int thPerBlock = 128; const int thPerBlock = 256;
int* d_result, mBpSM, h_high; int* d_result, mBpSM, h_high;
char** d_argv; char** d_argv;
cudaDeviceProp prop; cudaDeviceProp prop;