diff --git a/libs/keymanip.cu b/libs/keymanip.cu index 6d2984d..107b19a 100644 --- a/libs/keymanip.cu +++ b/libs/keymanip.cu @@ -41,6 +41,7 @@ __device__ void getRawAddress(int lErase, Key32& InvertedPublicKey, Addr16& rawA memcpy(&rawAddr[2], &InvertedPublicKey[start], 14); } __device__ void invertKey(const unsigned char* key, unsigned char* inverted) { +#pragma unroll for (int i = 0; i < 32; i += 4) { uchar4 k = *(reinterpret_cast(&key[i])); *(reinterpret_cast(&inverted[i])) = make_uchar4(k.x ^ 0xFF, k.y ^ 0xFF, k.z ^ 0xFF, k.w ^ 0xFF); diff --git a/sources/main.cu b/sources/main.cu index 811747e..63db9d6 100644 --- a/sources/main.cu +++ b/sources/main.cu @@ -8,7 +8,8 @@ #include __device__ unsigned d_high = 0x10; __device__ int parameters(const char* arg) noexcept { - if ((cstring_find(arg, "--altitude") == 0 && cstring_length(arg) == 10) || (cstring_find(arg, "-a") == 0 && cstring_length(arg) == 2)) { + if ((cstring_find(arg, "--altitude") == 0 && cstring_length(arg) == 10) || + (cstring_find(arg, "-a") == 0 && cstring_length(arg) == 2)) { return 777; } int space_index = cstring_find(arg, " "); @@ -49,7 +50,8 @@ __device__ __forceinline__ unsigned char getZeros(const unsigned char* v) noexce unsigned char leadZeros = 0; #pragma unroll for (int i = 0; i < 32; i += 4) { - unsigned word = (static_cast(v[i]) << 24) | (static_cast(v[i + 1]) << 16) | (static_cast(v[i + 2]) << 8) | (static_cast(v[i + 3])); + unsigned word = (static_cast(v[i]) << 24) | (static_cast(v[i + 1]) << 16) | + (static_cast(v[i + 2]) << 8) | (static_cast(v[i + 3])); if (word == 0) leadZeros += 32; else { @@ -59,19 +61,32 @@ __device__ __forceinline__ unsigned char getZeros(const unsigned char* v) noexce } return leadZeros; } + /* __global__ void initRandSeed(curandState* states, const unsigned long seed) { int idx = blockIdx.x * blockDim.x + threadIdx.x; curand_init(seed, idx, 0, &states[idx]); } */ -__global__ void initRand(curandState* rs) { +__global__ void initRand(curandState* rs, unsigned int* d_seeds) { int id = blockIdx.x * blockDim.x + threadIdx.x; curand_init(clock64() + id * 7919ULL, id, 0, &rs[id]); #pragma unroll 10 for (int i = 0; i < 10; i++) { curand(&rs[id]); } + unsigned int seed = curand(&rs[id]); + d_seeds[id] = seed; +} +int checkSeeds(unsigned int* seeds, int count) { + for (int i = 0; i < count; i++) { + for (int j = i + 1; j < count; j++) { + if (seeds[i] == seeds[j]) { + return 1; + } + } + } + return 0; } __device__ __forceinline__ void rmbytes(unsigned char* buf, curandState* state) { #pragma unroll 8 @@ -84,11 +99,6 @@ __device__ __forceinline__ void rmbytes(unsigned char* buf, curandState* state) } } __global__ void KeyGenKernel(curandState* randStates) { - __shared__ unsigned local_high; - if (threadIdx.x == 0) { - local_high = d_high; - } - __syncthreads(); int idx = blockIdx.x * blockDim.x + threadIdx.x; curandState localState = randStates[idx]; while (true) { @@ -96,15 +106,13 @@ __global__ void KeyGenKernel(curandState* randStates) { KeysBox32 keys; rmbytes(seed, &localState); ed25519_keygen(keys.PrivateKey, keys.PublicKey, seed); - if (unsigned zeros = getZeros(keys.PublicKey); zeros > local_high) { - if (zeros > atomicMax(&d_high, zeros)) { - Addr16 raw; - Key32 inv; - invertKey(keys.PublicKey, inv); - getRawAddress(zeros, inv, raw); - printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddr(raw).data, ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data); - local_high = zeros; - } + if (unsigned zeros = getZeros(keys.PublicKey); zeros > atomicMax(&d_high, zeros)) { + Addr16 raw; + Key32 inv; + invertKey(keys.PublicKey, inv); + getRawAddress(zeros, inv, raw); + printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddr(raw).data, ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data); + d_high = zeros; } } } @@ -123,18 +131,30 @@ int main(int argc, char* argv[]) { cudaMemcpy(d_str, argv[i], len, cudaMemcpyHostToDevice); cudaMemcpy(&d_argv[i], &d_str, sizeof(char*), cudaMemcpyHostToDevice); } - args<<<1, 1>>>(d_argv, argc, d_result); + args<<<1, 1 >>>(d_argv, argc, d_result); cudaDeviceSynchronize(); cudaMemcpyFromSymbol(&h_high, d_high, sizeof(unsigned)); - cudaGetDeviceProperties_v2(&prop, 0); + cudaGetDeviceProperties(&prop, 0); cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGenKernel, THREADS_P_B, 0); const int tTh = mBpSM * prop.multiProcessorCount * THREADS_P_B; - printf("High addrs: 2%02x+\nSMs: %d\nTotalThreads: %d\nBlocksThreads: %d:%d\n", h_high, prop.multiProcessorCount, tTh, tTh / THREADS_P_B, THREADS_P_B); + printf("High addrs: 2%02x+\nSMs: %d\nTotalThreads: %d\nBlocks: %d (Threads: %d)\n", h_high, prop.multiProcessorCount, tTh, tTh / THREADS_P_B, THREADS_P_B); cudaMalloc(&rst, tTh * sizeof(curandState)); - initRand<<>>(rst); - cudaDeviceSynchronize(); - KeyGenKernel<<>>(rst); + unsigned int* d_seeds; + cudaMalloc(&d_seeds, tTh * sizeof(unsigned int)); + initRand<<>>(rst, d_seeds); cudaDeviceSynchronize(); + unsigned int* h_seeds = (unsigned int*)malloc(tTh * sizeof(unsigned int)); + cudaMemcpy(h_seeds, d_seeds, tTh * sizeof(unsigned int), cudaMemcpyDeviceToHost); + if (checkSeeds(h_seeds, tTh)) { + fprintf(stderr, "Duplicate seeds found!\n"); + free(h_seeds); + cudaFree(d_seeds); + cudaFree(rst); + return EXIT_FAILURE; + } + free(h_seeds); + cudaFree(d_seeds); + KeyGenKernel<<>>(rst); cudaFree(rst); return 0; } \ No newline at end of file