fixes

2025-03-15 19:36:02 +05:00 · 2025-03-15 19:36:02 +05:00 · e6c4f9ceb5
commit e6c4f9ceb5
parent 17f6d7be58
1 changed files with 10 additions and 11 deletions
--- a/sources/main.cu
+++ b/sources/main.cu
@@ -81,32 +81,31 @@ __device__ __forceinline__ void rmbytes(unsigned char* buf, curandState* state)
        buf[i * 4 + 3] = static_cast<unsigned char>((r >> 24) & 0xFF);
    }
 }
-__global__ void KeyGen(curandState* randStates) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned local_d_high;
+__global__ void KeyGenKernel(curandState* randStates) {
+    __shared__ unsigned local_high;
    if (threadIdx.x == 0) {
-        local_d_high = d_high;
+        local_high = d_high;
    }
    __syncthreads();
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    curandState localState = randStates[idx];
    while (true) {
        Key32 seed;
        KeysBox32 keys;
        rmbytes(seed, &localState);
        ed25519_keygen(keys.PrivateKey, keys.PublicKey, seed);
-        if (unsigned zeros = getZeros(keys.PublicKey); zeros > local_d_high) {
-            if (zeros > atomicMax((unsigned*)&d_high, zeros)) {
+        if (unsigned zeros = getZeros(keys.PublicKey); zeros > local_high) {
+            if (zeros > atomicMax(&d_high, zeros)) {
                Addr16 raw;
                Key32 inv;
                invertKey(keys.PublicKey, inv);
                getRawAddress(zeros, inv, raw);
                printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddr(raw).data, ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data);
-                local_d_high = zeros;
+                local_high = zeros;
            }
        }
    }
 }
-
 #define THREADS_P_B 256
 int main(int argc, char* argv[]) {
    int* d_result, mBpSM, h_high;
@@ -126,13 +125,13 @@ int main(int argc, char* argv[]) {
    cudaDeviceSynchronize();
    cudaMemcpyFromSymbol(&h_high, d_high, sizeof(unsigned));
    cudaGetDeviceProperties_v2(&prop, 0);
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGen, THREADS_P_B, 0);
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGenKernel, THREADS_P_B, 0);
    const int tTh = mBpSM * prop.multiProcessorCount * THREADS_P_B;
-    printf("High addrs: 2%02x+\nSMs: %d\nMaxBlocksPerSM: %d\nTotalTh: %d\nBlocksThreads: %d:%d\n", h_high, prop.multiProcessorCount, mBpSM, tTh, tTh / THREADS_P_B, THREADS_P_B);
+    printf("High addrs: 2%02x+\nSMs: %d\nTotalThreads: %d\nBlocksThreads: %d:%d\n", h_high, prop.multiProcessorCount, tTh, tTh / THREADS_P_B, THREADS_P_B);
    cudaMalloc(&rst, tTh * sizeof(curandState));
    initRand<<<tTh / THREADS_P_B, THREADS_P_B>>>(rst);
    cudaDeviceSynchronize();
-    KeyGen<<<tTh / THREADS_P_B, THREADS_P_B>>>(rst);
+    KeyGenKernel<<<tTh / THREADS_P_B, THREADS_P_B >>>(rst);
    cudaDeviceSynchronize();
    cudaFree(rst);
    return 0;