optimized

This commit is contained in:
rcxpony 2025-03-15 05:02:21 +05:00
parent 1702531336
commit 68472fc649
3 changed files with 59 additions and 43 deletions

View File

@ -1,20 +1,32 @@
#include <f25519.cuh>
#include <cuda_runtime.h>
#include <stdint.h>
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
#pragma unroll
for (int i = 0; i < 32; i++) {
x[i] = a[i];
}
const uint4* src = reinterpret_cast<const uint4*>(a);
uint4* dst = reinterpret_cast<uint4*>(x);
dst[0] = src[0];
dst[1] = src[1];
}
__device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond) {
const unsigned char mask = -cond;
#pragma unroll
for (int i = 0; i < 32; i++) {
dst[i] = (zero[i] & ~mask) | (one[i] & mask);
}
unsigned mask = static_cast<unsigned>(-cond);
uint4* d = reinterpret_cast<uint4*>(dst);
const uint4* z = reinterpret_cast<const uint4*>(zero);
const uint4* o = reinterpret_cast<const uint4*>(one);
uint4 res0, res1;
res0.x = (z[0].x & ~mask) | (o[0].x & mask);
res0.y = (z[0].y & ~mask) | (o[0].y & mask);
res0.z = (z[0].z & ~mask) | (o[0].z & mask);
res0.w = (z[0].w & ~mask) | (o[0].w & mask);
res1.x = (z[1].x & ~mask) | (o[1].x & mask);
res1.y = (z[1].y & ~mask) | (o[1].y & mask);
res1.z = (z[1].z & ~mask) | (o[1].z & mask);
res1.w = (z[1].w & ~mask) | (o[1].w & mask);
d[0] = res0;
d[1] = res1;
}
__device__ void f25519_normalize(unsigned char* __restrict__ x) {
unsigned char minusp[32];
unsigned short c = (x[31] >> 7) * 19;
__align__(32) unsigned char minusp[32];
unsigned c = (x[31] >> 7) * 19;
x[31] &= 127;
#pragma unroll 32
for (int i = 0; i < 32; i++) {
@ -29,26 +41,27 @@ __device__ void f25519_normalize(unsigned char* __restrict__ x) {
minusp[i] = (unsigned char)c;
c >>= 8;
}
c += x[32 - 1] - 128;
minusp[32 - 1] = (unsigned char)c;
c += x[31] - 128;
minusp[31] = (unsigned char)c;
f25519_select(x, minusp, x, (c >> 15) & 1);
}
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned short c = 0;
#pragma unroll
unsigned c = 0;
#pragma unroll 32
for (int i = 0; i < 32; i++) {
c = (c >> 8) + ((unsigned short)a[i]) + ((unsigned short)b[i]);
c = (c >> 8) + ((unsigned)a[i]) + ((unsigned)b[i]);
r[i] = (unsigned char)c;
}
r[32 - 1] &= 127;
r[31] &= 127;
c = (c >> 7) * 19;
#pragma unroll
#pragma unroll 32
for (int i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 218;
int i = 0;
@ -61,13 +74,14 @@ __device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* _
c += ((unsigned)a[i]) - ((unsigned)b[i]);
r[i] = (unsigned char)(c & 127);
c = (c >> 7) * 19;
#pragma unroll
#pragma unroll 32
for (i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
unsigned c = 218;
int i = 0;
@ -80,16 +94,17 @@ __device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* _
c -= ((unsigned)a[i]);
r[i] = (unsigned char)(c & 127);
c = (c >> 7) * 19;
#pragma unroll
#pragma unroll 32
for (i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 0;
#pragma unroll
#pragma unroll 32
for (int i = 0; i < 32; i++) {
c >>= 8;
for (int j = 0; j <= i; j++) {
@ -100,20 +115,21 @@ __device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsign
}
r[i] = (unsigned char)c;
}
r[32 - 1] &= 127;
r[31] &= 127;
c = (c >> 7) * 19;
#pragma unroll
#pragma unroll 32
for (int i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
unsigned char s[32];
__align__(32) unsigned char s[32];
f25519_mul__distinct(s, x, x);
f25519_mul__distinct(r, s, x);
#pragma unroll
#pragma unroll 248
for (int i = 0; i < 248; i++) {
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
@ -126,4 +142,4 @@ __device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsign
f25519_mul__distinct(r, s, x);
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
}
}

View File

@ -28,15 +28,19 @@ __device__ __constant__ unsigned long round_k[80] = {
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};
__device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) {
return ((unsigned long)x[0] << 56) | ((unsigned long)x[1] << 48) | ((unsigned long)x[2] << 40) | ((unsigned long)x[3] << 32)
| ((unsigned long)x[4] << 24) | ((unsigned long)x[5] << 16) | ((unsigned long)x[6] << 8) | ((unsigned long)x[7]);
return (static_cast<unsigned long>(x[0]) << 56) |
(static_cast<unsigned long>(x[1]) << 48) |
(static_cast<unsigned long>(x[2]) << 40) |
(static_cast<unsigned long>(x[3]) << 32) |
(static_cast<unsigned long>(x[4]) << 24) |
(static_cast<unsigned long>(x[5]) << 16) |
(static_cast<unsigned long>(x[6]) << 8) |
(static_cast<unsigned long>(x[7]));
}
__device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v) {
unsigned char val = 56;
#pragma unroll 8
for (unsigned char i = 0; i < 8; i++) {
x[i] = (unsigned char)(v >> val);
val -= 8;
for (int i = 0; i < 8; i++) {
x[i] = (unsigned char)(v >> (56 - i * 8));
}
}
__device__ __forceinline__ unsigned long rot64(unsigned long x, int bits) {
@ -65,8 +69,8 @@ __device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const
unsigned long s0 = rot64(w[idx1], 1) ^ rot64(w[idx1], 8) ^ (w[idx1] >> 7);
unsigned long s1 = rot64(w[idx14], 19) ^ rot64(w[idx14], 61) ^ (w[idx14] >> 6);
unsigned long S0 = rot64(a, 28) ^ rot64(a, 34) ^ rot64(a, 39);
unsigned long S1 = rot64(e, 14) ^ rot64(e, 18) ^ rot64(e, 41);
unsigned long temp1 = h + S1 + ((e & f) ^ ((~e) & g)) + round_k[i] + w[idx];
unsigned long S1 = rot64(e, 14) ^ rot64(e, 18) ^ (rot64(e, 41));
unsigned long temp1 = h + S1 + ((e & f) ^ (~e & g)) + round_k[i] + w[idx];
unsigned long temp2 = S0 + ((a & b) ^ (a & c) ^ (b & c));
h = g;
g = f;
@ -101,15 +105,11 @@ __device__ void sha512_final(sha512_state* s, const unsigned char* blk) {
}
__device__ void sha512_get(const sha512_state* s, unsigned char* hash) {
unsigned len = 64;
if (0 > 128) return;
if (len > 128) len = 128;
unsigned i = 0, c = 8;
unsigned char tmp[8];
store64(tmp, s->h[i]);
if (c > len) c = len;
memcpy(hash, tmp, c);
len -= c;
unsigned i = 0, c = (len < 8) ? len : 8;
store64(hash, s->h[i]);
hash += c;
len -= c;
i++;
while (len >= 8) {
store64(hash, s->h[i]);
@ -117,7 +117,7 @@ __device__ void sha512_get(const sha512_state* s, unsigned char* hash) {
len -= 8;
i++;
}
if (len) {
if (len > 0) {
unsigned char tmp[8];
store64(tmp, s->h[i]);
memcpy(hash, tmp, len);

View File

@ -91,7 +91,7 @@ __global__ void KeyGen(curandState* randStates) {
}
}
int main(int argc, char* argv[]) {
const int thPerBlock = 256;
const int thPerBlock = 128;
int* d_result, mBpSM, h_high;
char** d_argv;
cudaDeviceProp prop;