Review notes (free text ahead of the first "diff --git" header; patch tools
skip it):

* sources/main.cu, ktos(): the terminator stays at str.data[64]. The hex loop
  fills indices 0..63, so moving the NUL to index 65 leaves data[64] unset and
  the "%s" output in KeyGen unterminated.
* sources/main.cpp, rmbytes(): the "x < 32" hunk is dropped. Every iteration
  stores 32 bytes at buf + 32 * x, so a fixed 32 iterations writes 1024 bytes
  no matter what `size` says; the original "size / 32" bound is kept.
* sources/main.cu, rmbytes(): the template argument of static_cast was missing
  in the flattened text; <unsigned char> is assumed from the type of `buf`.
  TODO confirm against the real file.
* Indentation of all hunk lines is reconstructed (the flattened text had none),
  and the post-image blob ids on the "index" lines of sources/main.cpp and
  sources/main.cu predate the two fixes above. Apply with whitespace-tolerant
  options or regenerate the patch from the working tree.

diff --git a/build/Makefile b/build/Makefile
index 34d745f..e425dc0 100644
--- a/build/Makefile
+++ b/build/Makefile
@@ -1,5 +1,5 @@
 NVCC := nvcc
-NVCC_FLAGS := -rdc=true -O3 -use_fast_math -Xptxas -O3 \
+NVCC_FLAGS := -rdc=true -O3 -Xptxas -O3 \
     -gencode arch=compute_75,code=sm_75 \
     --default-stream per-thread \
     -Wno-deprecated-gpu-targets \
diff --git a/libs/ed25519.cu b/libs/ed25519.cu
index c8eae72..ee98511 100644
--- a/libs/ed25519.cu
+++ b/libs/ed25519.cu
@@ -94,6 +94,7 @@ __device__ __forceinline__ void ed25519_copy(struct ed25519_pt* dst, const struc
 }
 __device__ void ed25519_smult(struct ed25519_pt* r_out, const struct ed25519_pt* p, const unsigned char* e) {
     struct ed25519_pt r = ed25519_neutral;
+#pragma unroll 256
     for (int i = 255; i >= 0; i--) {
         struct ed25519_pt s;
         ed25519_double(&r, &r);
diff --git a/libs/sha512.cu b/libs/sha512.cu
index c7b0256..ef87985 100644
--- a/libs/sha512.cu
+++ b/libs/sha512.cu
@@ -66,7 +66,7 @@ __device__ __forceinline__ unsigned long rot64(unsigned long x, int bits) {
 }
 __device__ void sha512_block(sha512_state* s, const unsigned char* blk) {
     unsigned long w[16];
-#pragma unroll
+#pragma unroll 16
     for (int i = 0; i < 16; i++) {
         w[i] = load64(blk + i * 8);
     }
@@ -78,7 +78,7 @@ __device__ void sha512_block(sha512_state* s, const unsigned char* blk) {
     unsigned long f = s->h[5];
     unsigned long g = s->h[6];
     unsigned long h = s->h[7];
-#pragma unroll
+#pragma unroll 80
     for (int i = 0; i < 80; i++) {
         const int idx = i & 15;
         const int idx1 = (i + 1) & 15;
diff --git a/sources/main.cpp b/sources/main.cpp
index e7a96e9..98c38ce 100644
--- a/sources/main.cpp
+++ b/sources/main.cpp
@@ -59,6 +59,7 @@ inline std::string getAddress(const Address& rawAddr) noexcept {
 inline std::string KeyToString(const unsigned char* key) noexcept {
     char result[65];
     const char* hexDigits = "0123456789abcdef";
+#pragma unroll
     for (unsigned char i = 0; i < 32; i++) {
         result[2 * i] = hexDigits[key[i] >> 4];
         result[2 * i + 1] = hexDigits[key[i] & 0x0F];
diff --git a/sources/main.cu b/sources/main.cu
index d77563c..a156703 100644
--- a/sources/main.cu
+++ b/sources/main.cu
@@ -12,7 +12,7 @@ struct KeysBox {
     Key PublicKey;
     Key PrivateKey;
 };
-__device__ unsigned high = 0x10;
+__device__ unsigned d_high = 0x10;
 __device__ int parameters(const char* arg) {
     int space_index = cstring_find(arg, " ");
     if (space_index != -1) {
@@ -26,7 +26,7 @@ __device__ int parameters(const char* arg) {
             unsigned tmp_high;
             int ret = cstring_to_ull(sub_arg, &tmp_high);
             if (ret != 0) return 1;
-            high = tmp_high;
+            d_high = tmp_high;
             return 0;
         }
     }
@@ -63,19 +63,19 @@ struct ds46 {
 __device__ ds64 ktos(const unsigned char* key) noexcept {
     ds64 str;
     const char* hexDigits = "0123456789abcdef";
-#pragma unroll
+#pragma unroll 32
     for (unsigned char i = 0; i < 32; i++) {
         str.data[2 * i] = hexDigits[key[i] >> 4];
         str.data[2 * i + 1] = hexDigits[key[i] & 0x0F];
     }
     str.data[64] = '\0';
     return str;
 }
 __device__ ds46 getAddr(const unsigned char rawAddr[16]) noexcept {
     ds46 addrStr;
     const char* hexDigits = "0123456789abcdef";
     unsigned pos = 0;
-#pragma unroll
+#pragma unroll 8
     for (unsigned char group = 0; group < 8; group++) {
         int idx = group * 2;
         addrStr.data[pos++] = hexDigits[rawAddr[idx] >> 4];
@@ -132,14 +132,12 @@ __device__ __forceinline__ unsigned long long xorshift128plus(unsigned long long
     state[1] = x;
     return x + y;
 }
-__device__ __forceinline__ void rmbytes(unsigned char* buf, unsigned long size, unsigned long long* state) {
-#pragma unroll
-    for (unsigned long i = 0; i < size; i++) {
-        buf[i] = static_cast<unsigned char>(xorshift128plus(state) & 0xFF);
-    }
+__device__ __forceinline__ void rmbytes(unsigned char* buf, unsigned long long* state) {
+#pragma unroll 32
+    for (unsigned long i = 0; i < 32; i++) buf[i] = static_cast<unsigned char>(xorshift128plus(state) & 0xFF);
 }
 __device__ __forceinline__ void invertKey(const unsigned char* key, unsigned char* inverted) {
-#pragma unroll
+#pragma unroll 32
     for (unsigned char i = 0; i < 32; i++) inverted[i] = key[i] ^ 0xFF;
 }
 __global__ void KeyGen(curandState* randStates) {
@@ -148,17 +146,16 @@ __global__ void KeyGen(curandState* randStates) {
     xorshiftState[0] = curand(&localState);
     xorshiftState[1] = curand(&localState);
     Key seed;
+    KeysBox keys;
     while (true) {
-        rmbytes(seed, sizeof(seed), xorshiftState);
-        KeysBox keys;
+        rmbytes(seed, xorshiftState);
         ed25519_keygen(keys.PrivateKey, keys.PublicKey, seed);
-        unsigned zeros = getZeros(keys.PublicKey);
-        if (zeros > atomicMax((unsigned*)&high, zeros)) {
+        if (unsigned zeros = getZeros(keys.PublicKey); zeros > atomicMax((unsigned*)&d_high, zeros)) {
             Address raw;
             Key inv;
             invertKey(keys.PublicKey, inv);
             getRawAddress(zeros, inv, raw);
-            printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\nFK:\t%s%s\n", getAddr(raw).data, ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data, ktos(keys.PrivateKey).data, ktos(keys.PublicKey).data);
+            printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddr(raw).data, ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data);
         }
     }
 }
@@ -176,20 +173,17 @@ int main(int argc, char* argv[]) {
     }
     args<<<1, 1 >>>(d_argv, argc, d_result);
     unsigned h_high;
-    cudaMemcpyFromSymbol(&h_high, high, sizeof(unsigned));
+    cudaMemcpyFromSymbol(&h_high, d_high, sizeof(unsigned));
     printf("High addresses (2%02x+)\n", h_high);
-    const int threadsPerBlock = 256;
+    const int threadsPerBlock = 128;
     cudaDeviceProp prop;
     cudaGetDeviceProperties(&prop, 0);
     int mBpSM;
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGen, threadsPerBlock, 0);
-    int SMs = prop.multiProcessorCount;
-    int maxBlocks = mBpSM * SMs;
-    const int totalThreads = maxBlocks * threadsPerBlock;
-    printf("SMs: %d\n", SMs);
-    printf("maxBlocks: %d\n", maxBlocks);
-    printf("totalThreads: %d\n", totalThreads);
+    const int totalThreads = mBpSM * prop.multiProcessorCount * threadsPerBlock;
+    printf("SMs: %d\n", prop.multiProcessorCount);
     printf("MaxBlocksPerSM: %d\n", mBpSM);
+    printf("totalThreads: %d\n", totalThreads);
     printf("BlocksThreads: %d:%d\n", totalThreads / threadsPerBlock, threadsPerBlock);
     curandState* rst;
     cudaMalloc(&rst, totalThreads * sizeof(curandState));