optimized

This commit is contained in:
rcxpony 2025-03-15 05:23:11 +05:00
parent 68472fc649
commit 3634311467
5 changed files with 38 additions and 51 deletions

View File

@ -1,30 +1,30 @@
#include <edsign.cuh>
#include <ed25519.cuh>
#include <sha512.cuh>
// Fills `expanded` from `secret`: copies sha512_initial_state into a local
// hash state, runs sha512_final over `secret`, writes the digest out with
// sha512_get, then applies ed25519_prepare to `expanded` in place.
// NOTE(review): the caller in this file passes a 64-byte buffer for
// `expanded`; the expected length of `secret` is not visible here - confirm.
// NOTE(review): this text is a rendered commit diff, so two signature lines
// follow; the __forceinline__ one appears to be the post-commit form.
__device__ void expand_key(unsigned char* expanded, const unsigned char* secret) {
__device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret) {
struct sha512_state s;
// Begin from the predefined initial state rather than a zeroed struct.
memcpy(&s, &sha512_initial_state, sizeof(s));
sha512_final(&s, secret);
sha512_get(&s, expanded);
// In-place post-processing of the digest; its exact effect is defined
// elsewhere (ed25519_prepare is not shown in this view).
ed25519_prepare(expanded);
}
// Writes the packed form of point `p` into `packed`: ed25519_unproject fills
// two local 32-byte buffers (x, y) from `p`, then ed25519_pack combines them
// into `packed`.
// NOTE(review): the required size of `packed` is not visible here; callers in
// this file pass the public-key output buffer - confirm it is 32 bytes.
// NOTE(review): this text is a rendered commit diff, so two signature lines
// follow; the __forceinline__ one appears to be the post-commit form.
__device__ void pp(unsigned char* packed, const struct ed25519_pt* p) {
__device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p) {
unsigned char x[32], y[32];
ed25519_unproject(x, y, p);
ed25519_pack(packed, x, y);
}
// Multiplies ed25519_base by the scalar bytes `k` (ed25519_smult) and writes
// the packed result into `r` via pp().
// NOTE(review): the number of bytes of `k` that ed25519_smult reads is not
// visible here; the caller in this file passes a 64-byte buffer - confirm.
// NOTE(review): this text is a rendered commit diff, so two signature lines
// follow; the __forceinline__ one appears to be the post-commit form.
__device__ void sm_pack(unsigned char* r, const unsigned char* k) {
__device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k) {
struct ed25519_pt p;
ed25519_smult(&p, &ed25519_base, k);
pp(r, &p);
}
// Derives `pub` from `secret`: expand_key() fills a local 64-byte buffer from
// `secret`, and sm_pack() turns that buffer into the packed output in `pub`.
// NOTE(review): the local `expanded` buffer holds secret-derived bytes and no
// wipe of it is visible before return - confirm whether that is intended.
// NOTE(review): this text is a rendered commit diff, so two signature lines
// follow; the __forceinline__ one appears to be the post-commit form.
__device__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret) {
__device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret) {
unsigned char expanded[64];
expand_key(expanded, secret);
sm_pack(pub, expanded);
}
__device__ void compact_wipe(void* __restrict__ data) {
volatile unsigned char* p = (volatile unsigned char*)data;
__device__ __forceinline__ void compact_wipe(void* __restrict__ data) {
unsigned char* p = (unsigned char*)data;
unsigned long i = 0;
#pragma unroll
for (; i + 3 < 32; i += 4) {

View File

@ -1,9 +1,9 @@
// Include-guarded device-side prototypes for the edsign helpers.
// NOTE(review): this text is a rendered commit diff; the first five
// prototypes (plain __device__) appear to be the pre-commit lines and the
// next five (__device__ __forceinline__) their post-commit replacements.
#ifndef __EDSIGN_CUH
#define __EDSIGN_CUH
__device__ void expand_key(unsigned char* expanded, const unsigned char* secret);
__device__ void pp(unsigned char* packed, const struct ed25519_pt* p);
__device__ void sm_pack(unsigned char* r, const unsigned char* k);
__device__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret);
__device__ void compact_wipe(void* __restrict__ data);
// NOTE(review): these __forceinline__ prototypes have no body in this header;
// the definitions shown in the same commit sit in a source file that includes
// <edsign.cuh>. Inlining into any other translation unit needs the body to be
// visible there - confirm how callers are compiled and linked.
__device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret);
__device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p);
__device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k);
__device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret);
__device__ __forceinline__ void compact_wipe(void* __restrict__ data);
// Takes a 64-byte private key, a 32-byte public key and a 32-byte random seed
// buffer (sizes per the array bounds in the prototype); which are inputs and
// which are outputs is not visible from this declaration.
__device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]);
#endif

View File

@ -1,6 +1,5 @@
#include <f25519.cuh>
#include <cuda_runtime.h>
#include <stdint.h>
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
const uint4* src = reinterpret_cast<const uint4*>(a);
uint4* dst = reinterpret_cast<uint4*>(x);
@ -24,11 +23,12 @@ __device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned ch
d[0] = res0;
d[1] = res1;
}
__device__ void f25519_normalize(unsigned char* __restrict__ x) {
__align__(32) unsigned char minusp[32];
unsigned c = (x[31] >> 7) * 19;
x[31] &= 127;
#pragma unroll 32
#pragma unroll
for (int i = 0; i < 32; i++) {
c += x[i];
x[i] = (unsigned char)c;
@ -36,7 +36,7 @@ __device__ void f25519_normalize(unsigned char* __restrict__ x) {
}
c = 19;
#pragma unroll
for (int i = 0; i + 1 < 32; i++) {
for (int i = 0; i < 31; i++) {
c += x[i];
minusp[i] = (unsigned char)c;
c >>= 8;
@ -47,89 +47,81 @@ __device__ void f25519_normalize(unsigned char* __restrict__ x) {
}
// r = a + b over 32-byte little-endian operands (byte 0 is added first and
// carries propagate upward), followed by one reduction pass.
// All three pointers are declared __restrict__.
// NOTE(review): presumably arithmetic modulo 2^255 - 19, given the f25519
// naming and the *19 fold of bit 255 - confirm.
// NOTE(review): this text is a rendered commit diff; where two `#pragma
// unroll` lines are adjacent, the second appears to be the post-commit form.
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 0;
// Byte-wise add with carry: the carry from the previous byte is c >> 8.
#pragma unroll 32
#pragma unroll
for (int i = 0; i < 32; i++) {
c = (c >> 8) + ((unsigned)a[i]) + ((unsigned)b[i]);
r[i] = (unsigned char)c;
}
// c still holds the un-truncated sum for byte 31. Keep only the low 7 bits of
// that byte in r and fold everything from bit 7 of it upward back in as a
// multiple of 19.
r[31] &= 127;
c = (c >> 7) * 19;
// Add the folded value into r, rippling the carry through all 32 bytes.
#pragma unroll 32
#pragma unroll
for (int i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
// r = a - b over 32-byte little-endian operands, computed with a positive
// bias so the running value never goes negative, followed by one reduction
// pass. All three pointers are declared __restrict__.
// NOTE(review): the bias (initial 218 = 0xda, then 0xff carried into every
// later byte via the 65280 = 0xff00 term) presumably equals 2 * (2^255 - 19),
// a multiple of the modulus - confirm.
// NOTE(review): this text is a rendered commit diff; the loop headers, the
// byte-31 statements and the `#pragma unroll` lines appear in pre-commit
// (index `i` declared outside the loop) and post-commit (literal 31) pairs.
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 218;
int i = 0;
// Bytes 0..30: add 0xff00 + a[i] - b[i]; the low byte goes to r[i] and the
// rest (including the 0xff from the bias) carries into the next byte.
#pragma unroll
for (i = 0; i + 1 < 32; i++) {
for (int i = 0; i < 31; i++) {
c += 65280 + ((unsigned)a[i]) - ((unsigned)b[i]);
r[i] = (unsigned char)c;
c >>= 8;
}
// Byte 31 is handled without the 0xff00 term; only its low 7 bits are kept.
c += ((unsigned)a[i]) - ((unsigned)b[i]);
r[i] = (unsigned char)(c & 127);
c += ((unsigned)a[31]) - ((unsigned)b[31]);
r[31] = (unsigned char)(c & 127);
// Fold everything from bit 7 of byte 31 upward back in as a multiple of 19.
c = (c >> 7) * 19;
#pragma unroll 32
for (i = 0; i < 32; i++) {
#pragma unroll
for (int i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
// r = -a over a 32-byte little-endian operand, computed as (bias - a) with the
// same bias pattern as f25519_sub (initial 218, then 0xff00 per byte), then
// one reduction pass. Both pointers are declared __restrict__.
// NOTE(review): the bias presumably equals 2 * (2^255 - 19), a multiple of the
// modulus - confirm.
// NOTE(review): this text is a rendered commit diff; the loop headers, the
// byte-31 statements and the `#pragma unroll` lines appear in pre-commit
// (index `i` declared outside the loop) and post-commit (literal 31) pairs.
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
unsigned c = 218;
int i = 0;
// Bytes 0..30: add 0xff00 - a[i]; the low byte goes to r[i], the rest carries.
#pragma unroll
for (i = 0; i + 1 < 32; i++) {
for (int i = 0; i < 31; i++) {
c += 65280 - ((unsigned)a[i]);
r[i] = (unsigned char)c;
c >>= 8;
}
// Byte 31 is handled without the 0xff00 term; only its low 7 bits are kept.
c -= ((unsigned)a[i]);
r[i] = (unsigned char)(c & 127);
c -= ((unsigned)a[31]);
r[31] = (unsigned char)(c & 127);
// Fold everything from bit 7 of byte 31 upward back in as a multiple of 19.
c = (c >> 7) * 19;
#pragma unroll 32
for (i = 0; i < 32; i++) {
#pragma unroll
for (int i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
// r = a * b over 32-byte little-endian operands, computed one output byte at a
// time, followed by one reduction pass.
// `r` must not overlap `a` or `b`: r[i] is written while later iterations
// still read every byte of both inputs (the pointers are also __restrict__).
// NOTE(review): the factor 38 on wrapped-around products presumably reflects
// 2^256 being congruent to 38 modulo 2^255 - 19 - confirm.
// NOTE(review): this text is a rendered commit diff; the inner loops appear in
// a braced pre-commit form and a brace-less post-commit form, and adjacent
// `#pragma unroll` lines are pre/post pairs.
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 0;
#pragma unroll 32
#pragma unroll
for (int i = 0; i < 32; i++) {
// Carry from the previous output byte.
c >>= 8;
// Products whose byte indices sum to i land directly in output byte i.
for (int j = 0; j <= i; j++) {
for (int j = 0; j <= i; j++)
c += ((unsigned)a[j]) * ((unsigned)b[i - j]);
}
// Products whose byte indices sum to 32 + i wrap around into output byte i,
// scaled by 38.
for (int j = i + 1; j < 32; j++) {
for (int j = i + 1; j < 32; j++)
c += ((unsigned)a[j]) * ((unsigned)b[32 + i - j]) * 38;
}
r[i] = (unsigned char)c;
}
// Keep only the low 7 bits of byte 31 and fold everything above back in as a
// multiple of 19, then ripple the carry through all 32 bytes.
r[31] &= 127;
c = (c >> 7) * 19;
#pragma unroll 32
#pragma unroll
for (int i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
__align__(32) unsigned char s[32];
f25519_mul__distinct(s, x, x);
f25519_mul__distinct(r, s, x);
#pragma unroll 248
#pragma unroll
for (int i = 0; i < 248; i++) {
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);

View File

@ -28,14 +28,10 @@ __device__ __constant__ unsigned long round_k[80] = {
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};
// Assembles eight bytes from `x` into one integer, most significant byte
// first: x[0] ends up in bits 56..63 and x[7] in bits 0..7.
// NOTE(review): the shifts by 32..56 assume `unsigned long` is 64 bits wide,
// which does not hold on every platform - confirm the build targets.
// NOTE(review): this text is a rendered commit diff; the two return
// statements are the same expression, first in its pre-commit layout (one
// term per line) and then in its post-commit layout (two terms per line).
__device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) {
return (static_cast<unsigned long>(x[0]) << 56) |
(static_cast<unsigned long>(x[1]) << 48) |
(static_cast<unsigned long>(x[2]) << 40) |
(static_cast<unsigned long>(x[3]) << 32) |
(static_cast<unsigned long>(x[4]) << 24) |
(static_cast<unsigned long>(x[5]) << 16) |
(static_cast<unsigned long>(x[6]) << 8) |
(static_cast<unsigned long>(x[7]));
return (static_cast<unsigned long>(x[0]) << 56) | (static_cast<unsigned long>(x[1]) << 48) |
(static_cast<unsigned long>(x[2]) << 40) | (static_cast<unsigned long>(x[3]) << 32) |
(static_cast<unsigned long>(x[4]) << 24) | (static_cast<unsigned long>(x[5]) << 16) |
(static_cast<unsigned long>(x[6]) << 8) | (static_cast<unsigned long>(x[7]));
}
__device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v) {
#pragma unroll 8

View File

@ -18,8 +18,7 @@ __device__ int parameters(const char* arg) noexcept {
extract_substring(arg, substr_start, sub_arg, 256);
if (cstring_find(arg, "--altitude") != -1 || cstring_find(arg, "-a") != -1) {
unsigned tmp_high;
if (cstring_to_ull(sub_arg, &tmp_high) != 0)
return 1;
if (cstring_to_ull(sub_arg, &tmp_high) != 0) return 1;
d_high = tmp_high;
}
return 0;
@ -91,7 +90,7 @@ __global__ void KeyGen(curandState* randStates) {
}
}
int main(int argc, char* argv[]) {
const int thPerBlock = 128;
const int thPerBlock = 256;
int* d_result, mBpSM, h_high;
char** d_argv;
cudaDeviceProp prop;