// Brute-force search for ed25519 key pairs whose public key starts with as
// many zero bits as possible. Each GPU thread draws random 32-byte seeds,
// derives a key pair and prints it whenever it beats the best count so far.
//
// NOTE(review): the file as received carried eight #include directives whose
// header names had been stripped. The headers below are the ones the visible
// code demonstrably requires (plus <algorithm>/<vector> for checkSeeds and
// main). The project headers that declare
//   cstring_find, cstring_length, extract_substring, cstring_to_ull, concat,
//   ed25519_create_keypair, Key32, KeysBox32, Addr16, invertKey,
//   getRawAddress, getAddr and ktos
// must be restored here before this file will build.
#include <cuda_runtime.h>
#include <curand_kernel.h>

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>

// Launch geometry. RELEASE builds use 256 threads per block and as many
// blocks as the occupancy query in main() allows; other builds run a single
// thread so that device printf output stays readable.
// THDIVTHPB expands to an expression that reads a local variable named `tTh`.
#ifdef RELEASE
#define THREADSPB 256
#define THDIVTHPB (tTh / THREADSPB)
#else
#define THREADSPB 1
#define THDIVTHPB 1
#endif

// Abort with a readable message when a CUDA runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        const cudaError_t cuda_check_err_ = (call);                        \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                  \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)
#endif

// Best leading-zero-bit count seen so far; doubles as the minimum a key must
// exceed to be reported. Overridable through --altitude / -a.
__device__ unsigned d_high = 0x10;

// Interprets one command-line argument.
//
// Returns
//   777  when `arg` is exactly "--altitude" or "-a" (the value is expected in
//        the following argument),
//   1    when `arg` names the altitude option but its value fails to parse,
//   0    otherwise (including arguments that contain no space).
// On a successful parse d_high is overwritten with the parsed value.
//
// The cstring_* helpers are declared elsewhere; from their use here,
// cstring_find presumably returns the match index or -1.
__device__ int parameters(const char* arg) noexcept {
    const bool bare_long = cstring_find(arg, "--altitude") == 0 && cstring_length(arg) == 10;
    const bool bare_short = cstring_find(arg, "-a") == 0 && cstring_length(arg) == 2;
    if (bare_long || bare_short) {
        return 777;
    }

    const int space_index = cstring_find(arg, " ");
    if (space_index == -1) {
        return 0;
    }

    // Everything after the first space is treated as the option's value.
    const int substr_start = space_index + 1;
    char sub_arg[256];
    extract_substring(arg, substr_start, sub_arg, 256);

    if (cstring_find(arg, "--altitude") != -1 || cstring_find(arg, "-a") != -1) {
        // NOTE(review): the helper's name suggests an unsigned long long
        // out-parameter while an `unsigned*` is passed - confirm signature.
        unsigned tmp_high;
        if (cstring_to_ull(sub_arg, &tmp_high) != 0) {
            return 1;
        }
        d_high = tmp_high;
    }
    return 0;
}

// Parses the command line on the device so that d_high can be set directly.
//
// Launch layout: a single thread (<<<1, 1>>>); the loop is sequential.
// argv   device array of `argc` device pointers to NUL-terminated strings
// result result[0] receives 0 on success, 776 when a flag is the last
//        argument and has no value, 777 when "flag value" fails to parse.
__global__ void args(char** argv, int argc, int* result) {
    int err = 0;
    for (int x = 1; x < argc; x++) {
        const int res = parameters(argv[x]);
        if (res != 777) {
            continue;
        }
        // The flag stood alone: consume the next argument as its value.
        if (++x >= argc) {
            err = 776;
            break;
        }
        char combined[512];
        concat(argv[x - 1], argv[x], combined, 512);
        if (parameters(combined) != 0) {
            err = res;
            break;
        }
    }
    result[0] = err;
}

// Number of leading zero bits in a 32-bit word (32 for x == 0).
// NOTE(review): the cast target was lost in the source as received;
// `unsigned char` matches the return type.
__device__ __forceinline__ unsigned char zeroCounter(unsigned int x) noexcept {
    return x ? static_cast<unsigned char>(__clz(x)) : 32;
}

// Counts the leading zero bits of a 32-byte big-endian value.
// The count is accumulated in a wider type and saturated at 255 because an
// all-zero input would otherwise give 256, which wraps to 0 in the
// unsigned char return type.
__device__ __forceinline__ unsigned char getZeros(const unsigned char* v) noexcept {
    unsigned leadZeros = 0;
#pragma unroll
    for (int i = 0; i < 32; i += 4) {
        const unsigned word = (static_cast<unsigned>(v[i]) << 24) |
                              (static_cast<unsigned>(v[i + 1]) << 16) |
                              (static_cast<unsigned>(v[i + 2]) << 8) |
                              (static_cast<unsigned>(v[i + 3]));
        if (word != 0) {
            leadZeros += zeroCounter(word);
            break;
        }
        leadZeros += 32;
    }
    return static_cast<unsigned char>(leadZeros > 255u ? 255u : leadZeros);
}

/*
__global__ void initRandSeed(curandState* states, const unsigned long seed) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(seed, idx, 0, &states[idx]);
}
*/

// Initialises one cuRAND state per thread and records one sample per thread.
//
// Launch layout: 1-D grid; `rs` and `d_seeds` must each hold at least
// gridDim.x * blockDim.x elements because there is no bounds check.
// Each state is seeded from clock64() plus a per-thread offset, ten values
// are discarded, and the next value is stored in d_seeds[id].
__global__ void initRand(curandState* rs, unsigned int* d_seeds) {
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(clock64() + id * 7919ULL, id, 0, &rs[id]);
    for (int i = 0; i < 10; i++) {
        curand(&rs[id]);
    }
    d_seeds[id] = curand(&rs[id]);
}

// Returns 1 if any two of the first `count` values in `seeds` are equal,
// 0 otherwise. Works on a sorted copy so the caller's buffer is untouched
// and the cost is O(n log n) instead of comparing every pair.
int checkSeeds(unsigned* seeds, int count) {
    if (seeds == nullptr || count < 2) {
        return 0;
    }
    std::vector<unsigned> sorted(seeds, seeds + count);
    std::sort(sorted.begin(), sorted.end());
    return std::adjacent_find(sorted.begin(), sorted.end()) != sorted.end() ? 1 : 0;
}

// Fills buf[0..31] with random bytes: eight 32-bit draws from `state`, each
// stored least-significant byte first.
// NOTE(review): the cast targets were lost in the source as received;
// `unsigned char` matches the destination element type.
__device__ __forceinline__ void rmbytes(unsigned char* buf, curandState* state) {
#pragma unroll 8
    for (int i = 0; i < 8; i++) {
        const unsigned r = curand(state);
        buf[i * 4] = static_cast<unsigned char>(r & 0xFF);
        buf[i * 4 + 1] = static_cast<unsigned char>((r >> 8) & 0xFF);
        buf[i * 4 + 2] = static_cast<unsigned char>((r >> 16) & 0xFF);
        buf[i * 4 + 3] = static_cast<unsigned char>((r >> 24) & 0xFF);
    }
}

// Key search loop.
//
// Launch layout: 1-D grid; randStates must hold one initialised state per
// launched thread. Each thread works on a private copy of its state.
// Without DEBUG the loop counter never changes, so the kernel runs until
// the process is terminated.
__global__ void KeyGenKernel(curandState* randStates) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    curandState localState = randStates[idx];

    // Unsigned so that the DEBUG increment below cannot overflow a signed int.
    unsigned x = 1;
    while (x < 0xFFFFFFFF) {
        Key32 seed;
        KeysBox32 keys;
        rmbytes(seed, &localState);
        ed25519_create_keypair(keys.PrivateKey, keys.PublicKey, seed);

        // atomicMax returns the previous maximum and has already stored the
        // new one, so exactly the thread that raised the record reports it.
        // d_high must not be written again afterwards: a plain store here
        // could lower a better record set by another thread in the meantime.
        const unsigned zeros = getZeros(keys.PublicKey);
        if (zeros > atomicMax(&d_high, zeros)) {
            Addr16 raw;
            Key32 inv;
            invertKey(keys.PublicKey, inv);
            getRawAddress(zeros, inv, raw);
            printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddr(raw).data,
                   ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data);
        }
#ifdef DEBUG
        if ((++x & 0xFF) == 0) {
            printf("Iters: %u\n", x);
        }
#endif
    }
}

// Host entry point: forwards the command line to the device parser, sizes
// the grid from the occupancy query, seeds the generators and starts the
// search kernel. Returns 1 on an argument or seeding error.
int main(int argc, char* argv[]) {
    // ---- copy argv to the device and parse it there -----------------------
    int* d_result = nullptr;
    char** d_argv = nullptr;
    std::vector<char*> d_strings(argc > 0 ? argc : 0, nullptr);

    CUDA_CHECK(cudaMalloc((void**)&d_result, sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_argv, d_strings.size() * sizeof(char*)));
    for (size_t i = 0; i < d_strings.size(); i++) {
        const size_t len = strlen(argv[i]) + 1;
        CUDA_CHECK(cudaMalloc((void**)&d_strings[i], len));
        CUDA_CHECK(cudaMemcpy(d_strings[i], argv[i], len, cudaMemcpyHostToDevice));
    }
    if (!d_strings.empty()) {
        CUDA_CHECK(cudaMemcpy(d_argv, d_strings.data(), d_strings.size() * sizeof(char*),
                              cudaMemcpyHostToDevice));
    }

    args<<<1, 1>>>(d_argv, argc, d_result);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    int h_result = 0;
    CUDA_CHECK(cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost));

    for (char* d_str : d_strings) {
        CUDA_CHECK(cudaFree(d_str));
    }
    CUDA_CHECK(cudaFree(d_argv));
    CUDA_CHECK(cudaFree(d_result));

    if (h_result != 0) {
        fprintf(stderr, "Invalid or missing value for --altitude/-a (code %d)\n", h_result);
        return 1;
    }

    // ---- size the grid ----------------------------------------------------
    unsigned h_high = 0;
    CUDA_CHECK(cudaMemcpyFromSymbol(&h_high, d_high, sizeof(unsigned)));

    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));

    int mBpSM = 0;
    CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGenKernel, THREADSPB, 0));

    const int tTh = mBpSM * prop.multiProcessorCount * THREADSPB;
    if (tTh <= 0) {
        fprintf(stderr, "KeyGenKernel cannot be scheduled with %d threads per block\n", THREADSPB);
        return 1;
    }
    printf("High addrs: 2%02x+\nSMs: %d\nTotalThreads: %d\nBlocks: %d (Threads: %d)\n", h_high,
           prop.multiProcessorCount, tTh, tTh / THREADSPB, THREADSPB);

    // NOTE(review): the <<<...>>> launch configurations were lost from the
    // source as received. <<<THDIVTHPB, THREADSPB>>> is reconstructed from
    // the macros defined for that purpose - confirm against the original.
    const int blocks = THDIVTHPB;
    const int launched = blocks * THREADSPB;  // threads that really run

    // ---- seed the generators ---------------------------------------------
    curandState* rst = nullptr;
    unsigned* d_seeds = nullptr;
    CUDA_CHECK(cudaMalloc(&rst, tTh * sizeof(curandState)));
    CUDA_CHECK(cudaMalloc(&d_seeds, tTh * sizeof(unsigned)));

    initRand<<<blocks, THREADSPB>>>(rst, d_seeds);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

#ifndef DEBUG
    {
        // Only the entries written by launched threads are initialised, so
        // only those are compared.
        // NOTE(review): each entry is a single 32-bit curand output, so with
        // tens of thousands of threads equal values become likely by chance
        // alone; consider whether this check should remain fatal.
        std::vector<unsigned> h_seeds(launched);
        CUDA_CHECK(cudaMemcpy(h_seeds.data(), d_seeds, launched * sizeof(unsigned),
                              cudaMemcpyDeviceToHost));
        if (checkSeeds(h_seeds.data(), launched)) {
            fprintf(stderr, "Duplicate seeds found!\n");
            cudaFree(d_seeds);
            cudaFree(rst);
            return 1;
        }
    }
#endif
    CUDA_CHECK(cudaFree(d_seeds));

    // ---- run the search ---------------------------------------------------
    KeyGenKernel<<<blocks, THREADSPB>>>(rst);
    CUDA_CHECK(cudaGetLastError());
    // Wait explicitly instead of relying on cudaFree to block; this also
    // surfaces any fault raised while the kernel was running.
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaFree(rst));
    return 0;
}