diff --git a/libs/f25519.cu b/libs/f25519.cu index c955f6a..eb44d1a 100644 --- a/libs/f25519.cu +++ b/libs/f25519.cu @@ -1,20 +1,32 @@ #include +#include +#include __device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) { -#pragma unroll - for (int i = 0; i < 32; i++) { - x[i] = a[i]; - } + const uint4* src = reinterpret_cast(a); + uint4* dst = reinterpret_cast(x); + dst[0] = src[0]; + dst[1] = src[1]; } __device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond) { - const unsigned char mask = -cond; -#pragma unroll - for (int i = 0; i < 32; i++) { - dst[i] = (zero[i] & ~mask) | (one[i] & mask); - } + unsigned mask = static_cast(-cond); + uint4* d = reinterpret_cast(dst); + const uint4* z = reinterpret_cast(zero); + const uint4* o = reinterpret_cast(one); + uint4 res0, res1; + res0.x = (z[0].x & ~mask) | (o[0].x & mask); + res0.y = (z[0].y & ~mask) | (o[0].y & mask); + res0.z = (z[0].z & ~mask) | (o[0].z & mask); + res0.w = (z[0].w & ~mask) | (o[0].w & mask); + res1.x = (z[1].x & ~mask) | (o[1].x & mask); + res1.y = (z[1].y & ~mask) | (o[1].y & mask); + res1.z = (z[1].z & ~mask) | (o[1].z & mask); + res1.w = (z[1].w & ~mask) | (o[1].w & mask); + d[0] = res0; + d[1] = res1; } __device__ void f25519_normalize(unsigned char* __restrict__ x) { - unsigned char minusp[32]; - unsigned short c = (x[31] >> 7) * 19; + __align__(32) unsigned char minusp[32]; + unsigned c = (x[31] >> 7) * 19; x[31] &= 127; #pragma unroll 32 for (int i = 0; i < 32; i++) { @@ -29,26 +41,27 @@ __device__ void f25519_normalize(unsigned char* __restrict__ x) { minusp[i] = (unsigned char)c; c >>= 8; } - c += x[32 - 1] - 128; - minusp[32 - 1] = (unsigned char)c; + c += x[31] - 128; + minusp[31] = (unsigned char)c; f25519_select(x, minusp, x, (c >> 15) & 1); } __device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { - unsigned short c = 0; -#pragma unroll + unsigned c = 0; +#pragma unroll 32 for (int i = 0; i < 32; i++) { - c = (c >> 8) + ((unsigned short)a[i]) + ((unsigned short)b[i]); + c = (c >> 8) + ((unsigned)a[i]) + ((unsigned)b[i]); r[i] = (unsigned char)c; } - r[32 - 1] &= 127; + r[31] &= 127; c = (c >> 7) * 19; -#pragma unroll +#pragma unroll 32 for (int i = 0; i < 32; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } + __device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { unsigned c = 218; int i = 0; @@ -61,13 +74,14 @@ __device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* _ c += ((unsigned)a[i]) - ((unsigned)b[i]); r[i] = (unsigned char)(c & 127); c = (c >> 7) * 19; -#pragma unroll +#pragma unroll 32 for (i = 0; i < 32; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } + __device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) { unsigned c = 218; int i = 0; @@ -80,16 +94,17 @@ __device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* _ c -= ((unsigned)a[i]); r[i] = (unsigned char)(c & 127); c = (c >> 7) * 19; -#pragma unroll +#pragma unroll 32 for (i = 0; i < 32; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } + __device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { unsigned c = 0; -#pragma unroll +#pragma unroll 32 for (int i = 0; i < 32; i++) { c >>= 8; for (int j = 0; j <= i; j++) { @@ -100,20 +115,21 @@ __device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsign } r[i] = (unsigned char)c; } - r[32 - 1] &= 127; + r[31] &= 127; c = (c >> 7) * 19; -#pragma unroll +#pragma unroll 32 for (int i = 0; i < 32; i++) { c += r[i]; r[i] = (unsigned char)c; c >>= 8; } } + __device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) { - unsigned char s[32]; + __align__(32) unsigned char s[32]; f25519_mul__distinct(s, x, x); f25519_mul__distinct(r, s, x); -#pragma unroll +#pragma unroll 248 for (int i = 0; i < 248; i++) { f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, x); @@ -126,4 +142,4 @@ __device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsign f25519_mul__distinct(r, s, x); f25519_mul__distinct(s, r, r); f25519_mul__distinct(r, s, x); -} \ No newline at end of file +} diff --git a/libs/sha512.cu b/libs/sha512.cu index b0c74ed..881d17f 100644 --- a/libs/sha512.cu +++ b/libs/sha512.cu @@ -28,15 +28,19 @@ __device__ __constant__ unsigned long round_k[80] = { 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, }; __device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) { - return ((unsigned long)x[0] << 56) | ((unsigned long)x[1] << 48) | ((unsigned long)x[2] << 40) | ((unsigned long)x[3] << 32) - | ((unsigned long)x[4] << 24) | ((unsigned long)x[5] << 16) | ((unsigned long)x[6] << 8) | ((unsigned long)x[7]); + return (static_cast(x[0]) << 56) | + (static_cast(x[1]) << 48) | + (static_cast(x[2]) << 40) | + (static_cast(x[3]) << 32) | + (static_cast(x[4]) << 24) | + (static_cast(x[5]) << 16) | + (static_cast(x[6]) << 8) | + (static_cast(x[7])); } __device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v) { - unsigned char val = 56; #pragma unroll 8 - for (unsigned char i = 0; i < 8; i++) { - x[i] = (unsigned char)(v >> val); - val -= 8; + for (int i = 0; i < 8; i++) { + x[i] = (unsigned char)(v >> (56 - i * 8)); } } __device__ __forceinline__ unsigned long rot64(unsigned long x, int bits) { @@ -65,8 +69,8 @@ __device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const unsigned long s0 = rot64(w[idx1], 1) ^ rot64(w[idx1], 8) ^ (w[idx1] >> 7); unsigned long s1 = rot64(w[idx14], 19) ^ rot64(w[idx14], 61) ^ (w[idx14] >> 6); unsigned long S0 = rot64(a, 28) ^ rot64(a, 34) ^ rot64(a, 39); - unsigned long S1 = rot64(e, 14) ^ rot64(e, 18) ^ rot64(e, 41); - unsigned long temp1 = h + S1 + ((e & f) ^ ((~e) & g)) + round_k[i] + w[idx]; + unsigned long S1 = rot64(e, 14) ^ rot64(e, 18) ^ (rot64(e, 41)); + unsigned long temp1 = h + S1 + ((e & f) ^ (~e & g)) + round_k[i] + w[idx]; unsigned long temp2 = S0 + ((a & b) ^ (a & c) ^ (b & c)); h = g; g = f; @@ -101,15 +105,11 @@ __device__ void sha512_final(sha512_state* s, const unsigned char* blk) { } __device__ void sha512_get(const sha512_state* s, unsigned char* hash) { unsigned len = 64; - if (0 > 128) return; if (len > 128) len = 128; - unsigned i = 0, c = 8; - unsigned char tmp[8]; - store64(tmp, s->h[i]); - if (c > len) c = len; - memcpy(hash, tmp, c); - len -= c; + unsigned i = 0, c = (len < 8) ? len : 8; + store64(hash, s->h[i]); hash += c; + len -= c; i++; while (len >= 8) { store64(hash, s->h[i]); @@ -117,7 +117,7 @@ __device__ void sha512_get(const sha512_state* s, unsigned char* hash) { len -= 8; i++; } - if (len) { + if (len > 0) { unsigned char tmp[8]; store64(tmp, s->h[i]); memcpy(hash, tmp, len); diff --git a/sources/main.cu b/sources/main.cu index 348f5b4..e3c30cb 100644 --- a/sources/main.cu +++ b/sources/main.cu @@ -91,7 +91,7 @@ __global__ void KeyGen(curandState* randStates) { } } int main(int argc, char* argv[]) { - const int thPerBlock = 256; + const int thPerBlock = 128; int* d_result, mBpSM, h_high; char** d_argv; cudaDeviceProp prop;