optimized
This commit is contained in:
parent
1702531336
commit
68472fc649
@ -1,20 +1,32 @@
|
|||||||
#include <f25519.cuh>
|
#include <f25519.cuh>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <stdint.h>
|
||||||
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
|
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
|
||||||
#pragma unroll
|
const uint4* src = reinterpret_cast<const uint4*>(a);
|
||||||
for (int i = 0; i < 32; i++) {
|
uint4* dst = reinterpret_cast<uint4*>(x);
|
||||||
x[i] = a[i];
|
dst[0] = src[0];
|
||||||
}
|
dst[1] = src[1];
|
||||||
}
|
}
|
||||||
__device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond) {
|
__device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond) {
|
||||||
const unsigned char mask = -cond;
|
unsigned mask = static_cast<unsigned>(-cond);
|
||||||
#pragma unroll
|
uint4* d = reinterpret_cast<uint4*>(dst);
|
||||||
for (int i = 0; i < 32; i++) {
|
const uint4* z = reinterpret_cast<const uint4*>(zero);
|
||||||
dst[i] = (zero[i] & ~mask) | (one[i] & mask);
|
const uint4* o = reinterpret_cast<const uint4*>(one);
|
||||||
}
|
uint4 res0, res1;
|
||||||
|
res0.x = (z[0].x & ~mask) | (o[0].x & mask);
|
||||||
|
res0.y = (z[0].y & ~mask) | (o[0].y & mask);
|
||||||
|
res0.z = (z[0].z & ~mask) | (o[0].z & mask);
|
||||||
|
res0.w = (z[0].w & ~mask) | (o[0].w & mask);
|
||||||
|
res1.x = (z[1].x & ~mask) | (o[1].x & mask);
|
||||||
|
res1.y = (z[1].y & ~mask) | (o[1].y & mask);
|
||||||
|
res1.z = (z[1].z & ~mask) | (o[1].z & mask);
|
||||||
|
res1.w = (z[1].w & ~mask) | (o[1].w & mask);
|
||||||
|
d[0] = res0;
|
||||||
|
d[1] = res1;
|
||||||
}
|
}
|
||||||
__device__ void f25519_normalize(unsigned char* __restrict__ x) {
|
__device__ void f25519_normalize(unsigned char* __restrict__ x) {
|
||||||
unsigned char minusp[32];
|
__align__(32) unsigned char minusp[32];
|
||||||
unsigned short c = (x[31] >> 7) * 19;
|
unsigned c = (x[31] >> 7) * 19;
|
||||||
x[31] &= 127;
|
x[31] &= 127;
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for (int i = 0; i < 32; i++) {
|
for (int i = 0; i < 32; i++) {
|
||||||
@ -29,26 +41,27 @@ __device__ void f25519_normalize(unsigned char* __restrict__ x) {
|
|||||||
minusp[i] = (unsigned char)c;
|
minusp[i] = (unsigned char)c;
|
||||||
c >>= 8;
|
c >>= 8;
|
||||||
}
|
}
|
||||||
c += x[32 - 1] - 128;
|
c += x[31] - 128;
|
||||||
minusp[32 - 1] = (unsigned char)c;
|
minusp[31] = (unsigned char)c;
|
||||||
f25519_select(x, minusp, x, (c >> 15) & 1);
|
f25519_select(x, minusp, x, (c >> 15) & 1);
|
||||||
}
|
}
|
||||||
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
||||||
unsigned short c = 0;
|
unsigned c = 0;
|
||||||
#pragma unroll
|
#pragma unroll 32
|
||||||
for (int i = 0; i < 32; i++) {
|
for (int i = 0; i < 32; i++) {
|
||||||
c = (c >> 8) + ((unsigned short)a[i]) + ((unsigned short)b[i]);
|
c = (c >> 8) + ((unsigned)a[i]) + ((unsigned)b[i]);
|
||||||
r[i] = (unsigned char)c;
|
r[i] = (unsigned char)c;
|
||||||
}
|
}
|
||||||
r[32 - 1] &= 127;
|
r[31] &= 127;
|
||||||
c = (c >> 7) * 19;
|
c = (c >> 7) * 19;
|
||||||
#pragma unroll
|
#pragma unroll 32
|
||||||
for (int i = 0; i < 32; i++) {
|
for (int i = 0; i < 32; i++) {
|
||||||
c += r[i];
|
c += r[i];
|
||||||
r[i] = (unsigned char)c;
|
r[i] = (unsigned char)c;
|
||||||
c >>= 8;
|
c >>= 8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
||||||
unsigned c = 218;
|
unsigned c = 218;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
@ -61,13 +74,14 @@ __device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* _
|
|||||||
c += ((unsigned)a[i]) - ((unsigned)b[i]);
|
c += ((unsigned)a[i]) - ((unsigned)b[i]);
|
||||||
r[i] = (unsigned char)(c & 127);
|
r[i] = (unsigned char)(c & 127);
|
||||||
c = (c >> 7) * 19;
|
c = (c >> 7) * 19;
|
||||||
#pragma unroll
|
#pragma unroll 32
|
||||||
for (i = 0; i < 32; i++) {
|
for (i = 0; i < 32; i++) {
|
||||||
c += r[i];
|
c += r[i];
|
||||||
r[i] = (unsigned char)c;
|
r[i] = (unsigned char)c;
|
||||||
c >>= 8;
|
c >>= 8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
|
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
|
||||||
unsigned c = 218;
|
unsigned c = 218;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
@ -80,16 +94,17 @@ __device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* _
|
|||||||
c -= ((unsigned)a[i]);
|
c -= ((unsigned)a[i]);
|
||||||
r[i] = (unsigned char)(c & 127);
|
r[i] = (unsigned char)(c & 127);
|
||||||
c = (c >> 7) * 19;
|
c = (c >> 7) * 19;
|
||||||
#pragma unroll
|
#pragma unroll 32
|
||||||
for (i = 0; i < 32; i++) {
|
for (i = 0; i < 32; i++) {
|
||||||
c += r[i];
|
c += r[i];
|
||||||
r[i] = (unsigned char)c;
|
r[i] = (unsigned char)c;
|
||||||
c >>= 8;
|
c >>= 8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
||||||
unsigned c = 0;
|
unsigned c = 0;
|
||||||
#pragma unroll
|
#pragma unroll 32
|
||||||
for (int i = 0; i < 32; i++) {
|
for (int i = 0; i < 32; i++) {
|
||||||
c >>= 8;
|
c >>= 8;
|
||||||
for (int j = 0; j <= i; j++) {
|
for (int j = 0; j <= i; j++) {
|
||||||
@ -100,20 +115,21 @@ __device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsign
|
|||||||
}
|
}
|
||||||
r[i] = (unsigned char)c;
|
r[i] = (unsigned char)c;
|
||||||
}
|
}
|
||||||
r[32 - 1] &= 127;
|
r[31] &= 127;
|
||||||
c = (c >> 7) * 19;
|
c = (c >> 7) * 19;
|
||||||
#pragma unroll
|
#pragma unroll 32
|
||||||
for (int i = 0; i < 32; i++) {
|
for (int i = 0; i < 32; i++) {
|
||||||
c += r[i];
|
c += r[i];
|
||||||
r[i] = (unsigned char)c;
|
r[i] = (unsigned char)c;
|
||||||
c >>= 8;
|
c >>= 8;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
|
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
|
||||||
unsigned char s[32];
|
__align__(32) unsigned char s[32];
|
||||||
f25519_mul__distinct(s, x, x);
|
f25519_mul__distinct(s, x, x);
|
||||||
f25519_mul__distinct(r, s, x);
|
f25519_mul__distinct(r, s, x);
|
||||||
#pragma unroll
|
#pragma unroll 248
|
||||||
for (int i = 0; i < 248; i++) {
|
for (int i = 0; i < 248; i++) {
|
||||||
f25519_mul__distinct(s, r, r);
|
f25519_mul__distinct(s, r, r);
|
||||||
f25519_mul__distinct(r, s, x);
|
f25519_mul__distinct(r, s, x);
|
||||||
|
@ -28,15 +28,19 @@ __device__ __constant__ unsigned long round_k[80] = {
|
|||||||
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
|
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
|
||||||
};
|
};
|
||||||
__device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) {
|
__device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) {
|
||||||
return ((unsigned long)x[0] << 56) | ((unsigned long)x[1] << 48) | ((unsigned long)x[2] << 40) | ((unsigned long)x[3] << 32)
|
return (static_cast<unsigned long>(x[0]) << 56) |
|
||||||
| ((unsigned long)x[4] << 24) | ((unsigned long)x[5] << 16) | ((unsigned long)x[6] << 8) | ((unsigned long)x[7]);
|
(static_cast<unsigned long>(x[1]) << 48) |
|
||||||
|
(static_cast<unsigned long>(x[2]) << 40) |
|
||||||
|
(static_cast<unsigned long>(x[3]) << 32) |
|
||||||
|
(static_cast<unsigned long>(x[4]) << 24) |
|
||||||
|
(static_cast<unsigned long>(x[5]) << 16) |
|
||||||
|
(static_cast<unsigned long>(x[6]) << 8) |
|
||||||
|
(static_cast<unsigned long>(x[7]));
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v) {
|
__device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v) {
|
||||||
unsigned char val = 56;
|
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (unsigned char i = 0; i < 8; i++) {
|
for (int i = 0; i < 8; i++) {
|
||||||
x[i] = (unsigned char)(v >> val);
|
x[i] = (unsigned char)(v >> (56 - i * 8));
|
||||||
val -= 8;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
__device__ __forceinline__ unsigned long rot64(unsigned long x, int bits) {
|
__device__ __forceinline__ unsigned long rot64(unsigned long x, int bits) {
|
||||||
@ -65,8 +69,8 @@ __device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const
|
|||||||
unsigned long s0 = rot64(w[idx1], 1) ^ rot64(w[idx1], 8) ^ (w[idx1] >> 7);
|
unsigned long s0 = rot64(w[idx1], 1) ^ rot64(w[idx1], 8) ^ (w[idx1] >> 7);
|
||||||
unsigned long s1 = rot64(w[idx14], 19) ^ rot64(w[idx14], 61) ^ (w[idx14] >> 6);
|
unsigned long s1 = rot64(w[idx14], 19) ^ rot64(w[idx14], 61) ^ (w[idx14] >> 6);
|
||||||
unsigned long S0 = rot64(a, 28) ^ rot64(a, 34) ^ rot64(a, 39);
|
unsigned long S0 = rot64(a, 28) ^ rot64(a, 34) ^ rot64(a, 39);
|
||||||
unsigned long S1 = rot64(e, 14) ^ rot64(e, 18) ^ rot64(e, 41);
|
unsigned long S1 = rot64(e, 14) ^ rot64(e, 18) ^ (rot64(e, 41));
|
||||||
unsigned long temp1 = h + S1 + ((e & f) ^ ((~e) & g)) + round_k[i] + w[idx];
|
unsigned long temp1 = h + S1 + ((e & f) ^ (~e & g)) + round_k[i] + w[idx];
|
||||||
unsigned long temp2 = S0 + ((a & b) ^ (a & c) ^ (b & c));
|
unsigned long temp2 = S0 + ((a & b) ^ (a & c) ^ (b & c));
|
||||||
h = g;
|
h = g;
|
||||||
g = f;
|
g = f;
|
||||||
@ -101,15 +105,11 @@ __device__ void sha512_final(sha512_state* s, const unsigned char* blk) {
|
|||||||
}
|
}
|
||||||
__device__ void sha512_get(const sha512_state* s, unsigned char* hash) {
|
__device__ void sha512_get(const sha512_state* s, unsigned char* hash) {
|
||||||
unsigned len = 64;
|
unsigned len = 64;
|
||||||
if (0 > 128) return;
|
|
||||||
if (len > 128) len = 128;
|
if (len > 128) len = 128;
|
||||||
unsigned i = 0, c = 8;
|
unsigned i = 0, c = (len < 8) ? len : 8;
|
||||||
unsigned char tmp[8];
|
store64(hash, s->h[i]);
|
||||||
store64(tmp, s->h[i]);
|
|
||||||
if (c > len) c = len;
|
|
||||||
memcpy(hash, tmp, c);
|
|
||||||
len -= c;
|
|
||||||
hash += c;
|
hash += c;
|
||||||
|
len -= c;
|
||||||
i++;
|
i++;
|
||||||
while (len >= 8) {
|
while (len >= 8) {
|
||||||
store64(hash, s->h[i]);
|
store64(hash, s->h[i]);
|
||||||
@ -117,7 +117,7 @@ __device__ void sha512_get(const sha512_state* s, unsigned char* hash) {
|
|||||||
len -= 8;
|
len -= 8;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
if (len) {
|
if (len > 0) {
|
||||||
unsigned char tmp[8];
|
unsigned char tmp[8];
|
||||||
store64(tmp, s->h[i]);
|
store64(tmp, s->h[i]);
|
||||||
memcpy(hash, tmp, len);
|
memcpy(hash, tmp, len);
|
||||||
|
@ -91,7 +91,7 @@ __global__ void KeyGen(curandState* randStates) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
int main(int argc, char* argv[]) {
|
int main(int argc, char* argv[]) {
|
||||||
const int thPerBlock = 256;
|
const int thPerBlock = 128;
|
||||||
int* d_result, mBpSM, h_high;
|
int* d_result, mBpSM, h_high;
|
||||||
char** d_argv;
|
char** d_argv;
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user