From e6c4f9ceb53e277c6d573ce4cb1c1b2edbeb6ab9 Mon Sep 17 00:00:00 2001 From: rcxpony Date: Sat, 15 Mar 2025 19:36:02 +0500 Subject: [PATCH] fixes --- sources/main.cu | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/sources/main.cu b/sources/main.cu index 794c98a..229fc44 100644 --- a/sources/main.cu +++ b/sources/main.cu @@ -81,32 +81,31 @@ __device__ __forceinline__ void rmbytes(unsigned char* buf, curandState* state) buf[i * 4 + 3] = static_cast((r >> 24) & 0xFF); } } -__global__ void KeyGen(curandState* randStates) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - unsigned local_d_high; +__global__ void KeyGenKernel(curandState* randStates) { + __shared__ unsigned local_high; if (threadIdx.x == 0) { - local_d_high = d_high; + local_high = d_high; } __syncthreads(); + int idx = blockIdx.x * blockDim.x + threadIdx.x; curandState localState = randStates[idx]; while (true) { Key32 seed; KeysBox32 keys; rmbytes(seed, &localState); ed25519_keygen(keys.PrivateKey, keys.PublicKey, seed); - if (unsigned zeros = getZeros(keys.PublicKey); zeros > local_d_high) { - if (zeros > atomicMax((unsigned*)&d_high, zeros)) { + if (unsigned zeros = getZeros(keys.PublicKey); zeros > local_high) { + if (zeros > atomicMax(&d_high, zeros)) { Addr16 raw; Key32 inv; invertKey(keys.PublicKey, inv); getRawAddress(zeros, inv, raw); printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddr(raw).data, ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data); - local_d_high = zeros; + local_high = zeros; } } } } - #define THREADS_P_B 256 int main(int argc, char* argv[]) { int* d_result, mBpSM, h_high; @@ -126,13 +125,13 @@ int main(int argc, char* argv[]) { cudaDeviceSynchronize(); cudaMemcpyFromSymbol(&h_high, d_high, sizeof(unsigned)); cudaGetDeviceProperties_v2(&prop, 0); - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGen, THREADS_P_B, 0); + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGenKernel, THREADS_P_B, 0); const int tTh = mBpSM * prop.multiProcessorCount * THREADS_P_B; - printf("High addrs: 2%02x+\nSMs: %d\nMaxBlocksPerSM: %d\nTotalTh: %d\nBlocksThreads: %d:%d\n", h_high, prop.multiProcessorCount, mBpSM, tTh, tTh / THREADS_P_B, THREADS_P_B); + printf("High addrs: 2%02x+\nSMs: %d\nTotalThreads: %d\nBlocksThreads: %d:%d\n", h_high, prop.multiProcessorCount, tTh, tTh / THREADS_P_B, THREADS_P_B); cudaMalloc(&rst, tTh * sizeof(curandState)); initRand<<>>(rst); cudaDeviceSynchronize(); - KeyGen<<>>(rst); + KeyGenKernel<<>>(rst); cudaDeviceSynchronize(); cudaFree(rst); return 0;