diff --git a/.gitignore b/.gitignore index a8c5689..4412736 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,3 @@ .vscode/ -*.o -*.sh -main -yggm \ No newline at end of file +build/* +-Makefile \ No newline at end of file diff --git a/build/Makefile b/build/Makefile index 44f2703..976b02b 100644 --- a/build/Makefile +++ b/build/Makefile @@ -1,9 +1,9 @@ NVCC := nvcc NVCC_FLAGS := -O3 -use_fast_math -Xptxas -O3 -gencode arch=compute_75,code=sm_75 \ --default-stream per-thread -Wno-deprecated-gpu-targets --expt-relaxed-constexpr -I../libs/ -MAIN_SOURCE := ../sources/main.cu +MAIN_SOURCE := ../sources/main.cu LIBS_DIR := ../libs/ -BUILD_DIR := ../build/ +BUILD_DIR := ../build LIBS_SOURCES := $(wildcard $(LIBS_DIR)*.cu) LIBS_OBJECTS := $(patsubst $(LIBS_DIR)%.cu,$(BUILD_DIR)/%.o,$(LIBS_SOURCES)) TARGET := main @@ -14,5 +14,5 @@ $(TARGET): $(MAIN_SOURCE) $(LIBS_OBJECTS) $(NVCC) $(NVCC_FLAGS) -o $(TARGET) $(MAIN_SOURCE) $(LIBS_OBJECTS) $(BUILD_DIR)/%.o: $(LIBS_DIR)%.cu | $(BUILD_DIR) $(NVCC) $(NVCC_FLAGS) -c $< -o $@ -#clean: -# rm -f $(BUILD_DIR)/*.o \ No newline at end of file +clean: + rm $(BUILD_DIR)/main \ No newline at end of file diff --git a/libs/edsign.cuh b/libs/edsign.cuh index 0708fc1..1d49c7f 100644 --- a/libs/edsign.cuh +++ b/libs/edsign.cuh @@ -1,7 +1,6 @@ #pragma once #include #include -#include __device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret) { struct sha512_state s; sha512_init(&s); @@ -23,4 +22,16 @@ __device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsi unsigned char expanded[64]; expand_key(expanded, secret); sm_pack(pub, expanded); +} +__device__ void compact_wipe(void* data, unsigned long length) { + volatile unsigned char* p = (volatile unsigned char*)data; + while (length--) { + *p++ = 0; + } +} +__device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]) { + edsign_sec_to_pub(public_key, random_seed); + memcpy(private_key, random_seed, 32); + memcpy(private_key + 32, public_key, 32); + compact_wipe(random_seed, 32); } \ No newline at end of file diff --git a/libs/f25519.cuh b/libs/f25519.cuh index 49e0a28..a3fc165 100644 --- a/libs/f25519.cuh +++ b/libs/f25519.cuh @@ -1,7 +1,5 @@ #pragma once #define F25519_SIZE 32 -__device__ __constant__ unsigned char f25519_zero[F25519_SIZE] = { 0 }; -__device__ __constant__ unsigned char f25519_one[F25519_SIZE] = { 1 }; __device__ __forceinline__ void f25519_load(unsigned char* __restrict__ x, unsigned int c) { #pragma unroll for (unsigned int i = 0; i < sizeof(c); i++) { @@ -47,16 +45,6 @@ __device__ __forceinline__ void f25519_normalize(unsigned char* __restrict__ x) minusp[F25519_SIZE - 1] = (unsigned char)c; f25519_select(x, minusp, x, (c >> 15) & 1); } -__device__ __forceinline__ unsigned char f25519_eq(const unsigned char* __restrict__ x, const unsigned char* __restrict__ y) { - unsigned char s = 0; -#pragma unroll - for (int i = 0; i < F25519_SIZE; i++) - s |= x[i] ^ y[i]; - s |= s >> 4; - s |= s >> 2; - s |= s >> 1; - return (s ^ 1) & 1; -} __device__ __forceinline__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { unsigned short c = 0; #pragma unroll @@ -74,15 +62,15 @@ __device__ __forceinline__ void f25519_add(unsigned char* __restrict__ r, const } } __device__ __forceinline__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { - uint32_t c = 218; + unsigned c = 218; int i = 0; #pragma unroll for (i = 0; i + 1 < F25519_SIZE; i++) { - c += 65280 + ((uint32_t)a[i]) - ((uint32_t)b[i]); + c += 65280 + ((unsigned)a[i]) - ((unsigned)b[i]); r[i] = (unsigned char)c; c >>= 8; } - c += ((uint32_t)a[i]) - ((uint32_t)b[i]); + c += ((unsigned)a[i]) - ((unsigned)b[i]); r[i] = (unsigned char)(c & 127); c = (c >> 7) * 19; #pragma unroll @@ -93,15 +81,15 @@ __device__ __forceinline__ void f25519_sub(unsigned char* __restrict__ r, const } } __device__ __forceinline__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) { - uint32_t c = 218; + unsigned c = 218; int i = 0; #pragma unroll for (i = 0; i + 1 < F25519_SIZE; i++) { - c += 65280 - ((uint32_t)a[i]); + c += 65280 - ((unsigned)a[i]); r[i] = (unsigned char)c; c >>= 8; } - c -= ((uint32_t)a[i]); + c -= ((unsigned)a[i]); r[i] = (unsigned char)(c & 127); c = (c >> 7) * 19; #pragma unroll @@ -112,15 +100,15 @@ __device__ __forceinline__ void f25519_neg(unsigned char* __restrict__ r, const } } __device__ __forceinline__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) { - uint32_t c = 0; + unsigned c = 0; #pragma unroll for (int i = 0; i < F25519_SIZE; i++) { c >>= 8; for (int j = 0; j <= i; j++) { - c += ((uint32_t)a[j]) * ((uint32_t)b[i - j]); + c += ((unsigned)a[j]) * ((unsigned)b[i - j]); } for (int j = i + 1; j < F25519_SIZE; j++) { - c += ((uint32_t)a[j]) * ((uint32_t)b[F25519_SIZE + i - j]) * 38; + c += ((unsigned)a[j]) * ((unsigned)b[F25519_SIZE + i - j]) * 38; } r[i] = (unsigned char)c; } diff --git a/libs/fprime.cuh b/libs/fprime.cuh deleted file mode 100644 index 2998c20..0000000 --- a/libs/fprime.cuh +++ /dev/null @@ -1,88 +0,0 @@ -#pragma once -#include -__device__ void raw_add(unsigned char* x, const unsigned char* p) { - unsigned short c = 0; - for (int i = 0; i < 32; i++) { - c += ((unsigned short)x[i]) + ((unsigned short)p[i]); - x[i] = (unsigned char)c; - c >>= 8; - } -} -__device__ void fprime_select(unsigned char* dst, const unsigned char* zero, const unsigned char* one, unsigned char condition) { - const unsigned char mask = -condition; - for (int i = 0; i < 32; i++) - dst[i] = zero[i] ^ (mask & (one[i] ^ zero[i])); -} -__device__ void raw_try_sub(unsigned char* x, const unsigned char* p) { - unsigned char minusp[32]; - unsigned short c = 0; - for (int i = 0; i < 32; i++) { - c = ((unsigned short)x[i]) - ((unsigned short)p[i]) - c; - minusp[i] = (unsigned char)c; - c = (c >> 8) & 1; - } - fprime_select(x, minusp, x, c); -} -__device__ int prime_msb(const unsigned char* p) { - int i; - unsigned char x; - for (i = 32 - 1; i >= 0; i--) { - if (p[i]) break; - } - x = p[i]; - i <<= 3; - while (x) { - x >>= 1; - i++; - } - return i - 1; -} -__device__ void shift_n_bits(unsigned char* x, int n) { - unsigned short c = 0; - for (int i = 0; i < 32; i++) { - c |= ((unsigned short)x[i]) << n; - x[i] = (unsigned char)c; - c >>= 8; - } -} -__device__ inline int min_int(int a, int b) { - return a < b ? a : b; -} -__device__ void fprime_from_bytes(unsigned char* n, const unsigned char* x, unsigned long len, const unsigned char* modulus) { - const int preload_total = min_int(prime_msb(modulus) - 1, (int)(len << 3)); - const int preload_bytes = preload_total >> 3; - const int preload_bits = preload_total & 7; - const int rbits = (len << 3) - preload_total; - memset(n, 0, 32); - for (int i = 0; i < preload_bytes; i++) - n[i] = x[len - preload_bytes + i]; - if (preload_bits) { - shift_n_bits(n, preload_bits); - n[0] |= x[len - preload_bytes - 1] >> (8 - preload_bits); - } - for (int i = rbits - 1; i >= 0; i--) { - const unsigned char bit = (x[i >> 3] >> (i & 7)) & 1; - shift_n_bits(n, 1); - n[0] |= bit; - raw_try_sub(n, modulus); - } -} -__device__ void fprime_add(unsigned char* r, const unsigned char* a, const unsigned char* modulus) { - raw_add(r, a); - raw_try_sub(r, modulus); -} -__device__ inline void fprime_copy(unsigned char* x, const unsigned char* a) { - memcpy(x, a, 32); -} -__device__ void fprime_mul(unsigned char* r, const unsigned char* a, const unsigned char* b, const unsigned char* modulus) { - memset(r, 0, 32); - for (int i = prime_msb(modulus); i >= 0; i--) { - const unsigned char bit = (b[i >> 3] >> (i & 7)) & 1; - unsigned char plusa[32]; - shift_n_bits(r, 1); - raw_try_sub(r, modulus); - fprime_copy(plusa, r); - fprime_add(plusa, a, modulus); - fprime_select(r, r, plusa, bit); - } -} \ No newline at end of file diff --git a/libs/sha512.cuh b/libs/sha512.cuh index 264228b..e36b576 100644 --- a/libs/sha512.cuh +++ b/libs/sha512.cuh @@ -8,7 +8,7 @@ __device__ __constant__ sha512_state sha512_initial_state = { { 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, } }; -__device__ __constant__ uint64_t round_k[80] = { +__device__ __constant__ unsigned long round_k[80] = { 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, @@ -50,11 +50,11 @@ __device__ __constant__ uint64_t round_k[80] = { 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, }; -__device__ __forceinline__ uint64_t load64(const unsigned char* x) { - return ((uint64_t)x[0] << 56) | ((uint64_t)x[1] << 48) | ((uint64_t)x[2] << 40) | ((uint64_t)x[3] << 32) - | ((uint64_t)x[4] << 24) | ((uint64_t)x[5] << 16) | ((uint64_t)x[6] << 8) | ((uint64_t)x[7]); +__device__ __forceinline__ unsigned long load64(const unsigned char* x) { + return ((unsigned long)x[0] << 56) | ((unsigned long)x[1] << 48) | ((unsigned long)x[2] << 40) | ((unsigned long)x[3] << 32) + | ((unsigned long)x[4] << 24) | ((unsigned long)x[5] << 16) | ((unsigned long)x[6] << 8) | ((unsigned long)x[7]); } -__device__ __forceinline__ void store64(unsigned char* x, uint64_t v) { +__device__ __forceinline__ void store64(unsigned char* x, unsigned long v) { x[0] = (unsigned char)(v >> 56); x[1] = (unsigned char)(v >> 48); x[2] = (unsigned char)(v >> 40); @@ -64,37 +64,37 @@ __device__ __forceinline__ void store64(unsigned char* x, uint64_t v) { x[6] = (unsigned char)(v >> 8); x[7] = (unsigned char)(v); } -__device__ __forceinline__ uint64_t rot64(uint64_t x, int bits) { +__device__ __forceinline__ unsigned long rot64(unsigned long x, int bits) { return (x >> bits) | (x << (64 - bits)); } __device__ void sha512_block(sha512_state* s, const unsigned char* blk) { - uint64_t w[16]; + unsigned long w[16]; #pragma unroll for (int i = 0; i < 16; i++) { w[i] = load64(blk + i * 8); } - uint64_t a = s->h[0]; - uint64_t b = s->h[1]; - uint64_t c = s->h[2]; - uint64_t d = s->h[3]; - uint64_t e = s->h[4]; - uint64_t f = s->h[5]; - uint64_t g = s->h[6]; - uint64_t h = s->h[7]; + unsigned long a = s->h[0]; + unsigned long b = s->h[1]; + unsigned long c = s->h[2]; + unsigned long d = s->h[3]; + unsigned long e = s->h[4]; + unsigned long f = s->h[5]; + unsigned long g = s->h[6]; + unsigned long h = s->h[7]; #pragma unroll for (int i = 0; i < 80; i++) { const int idx = i & 15; const int idx1 = (i + 1) & 15; const int idx7 = (i + 9) & 15; const int idx14 = (i + 14) & 15; - uint64_t s0 = rot64(w[idx1], 1) ^ rot64(w[idx1], 8) ^ (w[idx1] >> 7); - uint64_t s1 = rot64(w[idx14], 19) ^ rot64(w[idx14], 61) ^ (w[idx14] >> 6); - uint64_t S0 = rot64(a, 28) ^ rot64(a, 34) ^ rot64(a, 39); - uint64_t S1 = rot64(e, 14) ^ rot64(e, 18) ^ rot64(e, 41); - uint64_t ch = (e & f) ^ ((~e) & g); - uint64_t temp1 = h + S1 + ch + round_k[i] + w[idx]; - uint64_t maj = (a & b) ^ (a & c) ^ (b & c); - uint64_t temp2 = S0 + maj; + unsigned long s0 = rot64(w[idx1], 1) ^ rot64(w[idx1], 8) ^ (w[idx1] >> 7); + unsigned long s1 = rot64(w[idx14], 19) ^ rot64(w[idx14], 61) ^ (w[idx14] >> 6); + unsigned long S0 = rot64(a, 28) ^ rot64(a, 34) ^ rot64(a, 39); + unsigned long S1 = rot64(e, 14) ^ rot64(e, 18) ^ rot64(e, 41); + unsigned long ch = (e & f) ^ ((~e) & g); + unsigned long temp1 = h + S1 + ch + round_k[i] + w[idx]; + unsigned long maj = (a & b) ^ (a & c) ^ (b & c); + unsigned long temp2 = S0 + maj; h = g; g = f; f = e; diff --git a/sources/main.cu b/sources/main.cu index 889f0b0..ae7e557 100644 --- a/sources/main.cu +++ b/sources/main.cu @@ -1,24 +1,16 @@ -#include -#include -#include -#include +#include #include #include #include -#include #include #include -__device__ __constant__ char hexDigitsConst[17] = "0123456789abcdef"; using Address = unsigned char[16]; using Key = unsigned char[32]; struct KeysBox { Key PublicKey; Key PrivateKey; }; -struct option { - unsigned high = 0x10; -}; -__device__ static option conf; +__device__ static unsigned high = 0x10; struct ds64 { char data[65]; }; @@ -27,7 +19,7 @@ struct ds46 { }; __device__ ds64 KeyToString(const unsigned char* key) noexcept { ds64 str; - const char* hexDigits = hexDigitsConst; + const char* hexDigits = "0123456789abcdef"; for (unsigned char i = 0; i < 32; i++) { str.data[2 * i] = hexDigits[key[i] >> 4]; str.data[2 * i + 1] = hexDigits[key[i] & 0x0F]; @@ -37,13 +29,9 @@ __device__ ds64 KeyToString(const unsigned char* key) noexcept { } __device__ ds46 getAddress(const unsigned char rawAddr[16]) noexcept { ds46 addrStr; -#ifdef __CUDA_ARCH__ - const char* hexDigits = hexDigitsConst; -#else const char* hexDigits = "0123456789abcdef"; -#endif - int pos = 0; - for (int group = 0; group < 8; group++) { + unsigned pos = 0; + for (unsigned char group = 0; group < 8; group++) { int idx = group * 2; addrStr.data[pos++] = hexDigits[rawAddr[idx] >> 4]; addrStr.data[pos++] = hexDigits[rawAddr[idx] & 0x0F]; @@ -69,7 +57,28 @@ __device__ void getRawAddress(int lErase, Key& InvertedPublicKey, Address& rawAd rawAddr[1] = static_cast(lErase - 1); memcpy(&rawAddr[2], &InvertedPublicKey[start], 14); } -__device__ unsigned long long xorshift128plus(unsigned long long* state) { +__device__ unsigned char zeroCounter(unsigned int x) { + if (x == 0) return 32; + return static_cast(__builtin_clz(x)); +} +__device__ unsigned char getZeros(const unsigned char* v) { + unsigned char leadZeros = 0; + for (int i = 0; i < 32; i += 4) { + unsigned word = (static_cast(v[i]) << 24) | (static_cast(v[i + 1]) << 16) | (static_cast(v[i + 2]) << 8) | (static_cast(v[i + 3])); + if (word == 0) { + leadZeros += 32; + } else { + leadZeros += zeroCounter(word); + break; + } + } + return leadZeros; +} +__global__ void initRand(curandState* randStates) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + curand_init((unsigned long long)clock64() + id, id, 0, &randStates[id]); +} +__device__ unsigned long long xorshift128plus(unsigned long long* state) noexcept { unsigned long long x = state[0]; const unsigned long long y = state[1]; state[0] = y; @@ -79,80 +88,52 @@ __device__ unsigned long long xorshift128plus(unsigned long long* state) { state[1] = x; return x + y; } -__device__ unsigned char zeroCounter(unsigned int x) { - if (x == 0) - return 32; -#ifdef __CUDA_ARCH__ - return static_cast(__clz(x)); -#else - return static_cast(__builtin_clz(x)); -#endif -} -__device__ unsigned char getZeros(const unsigned char* v) { - unsigned char leadZeros = 0; - for (int i = 0; i < 32; i += 4) { - unsigned word = (static_cast(v[i]) << 24) | (static_cast(v[i + 1]) << 16) | (static_cast(v[i + 2]) << 8) | (static_cast(v[i + 3])); - if (word == 0) - leadZeros += 32; - else { - leadZeros += zeroCounter(word); - break; - } - } - return leadZeros; -} -__global__ void initRandStates(curandState* randStates) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - curand_init((unsigned long long)clock64() + id, id, 0, &randStates[id]); -} -__device__ void generateRandomBytes(unsigned char* buf, unsigned long size, curandState* state) { +__device__ void rmbytes(unsigned char* buf, unsigned long size, unsigned long long* state) { for (unsigned long i = 0; i < size; i++) { - buf[i] = curand(state) & 0xFF; + buf[i] = xorshift128plus(state) & 0xFF; } } __device__ void invertKey(const unsigned char* key, unsigned char* inverted) { for (int i = 0; i < 32; i++) inverted[i] = key[i] ^ 0xFF; } -__device__ void compact_wipe(void* data, unsigned long length) { - volatile unsigned char* p = (volatile unsigned char*)data; - while (length--) { - *p++ = 0; - } -} -__device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]) { - edsign_sec_to_pub(public_key, random_seed); - memcpy(private_key, random_seed, 32); - memcpy(private_key + 32, public_key, 32); - compact_wipe(random_seed, 32); -} -__global__ __launch_bounds__(256) void minerKernel(curandState* randStates) { +__global__ void minerKernel(curandState* randStates) { int thid = blockIdx.x * blockDim.x + threadIdx.x; curandState localState = randStates[thid]; + unsigned long long xorshiftState[2]; + xorshiftState[0] = curand(&localState); + xorshiftState[1] = curand(&localState); Key seed; - generateRandomBytes(seed, sizeof(seed), &localState); + rmbytes(seed, sizeof(seed), xorshiftState); + if (thid == 0) printf("Seed: %s\n", KeyToString(seed).data); while (true) { - generateRandomBytes(seed, sizeof(seed), &localState); KeysBox keys; ed25519_keygen(keys.PrivateKey, keys.PublicKey, seed); - if (unsigned zeros = getZeros(keys.PublicKey); zeros > atomicMax(&conf.high, (unsigned)zeros)) { - Key inv; - Address raw; - invertKey(keys.PublicKey, inv); - getRawAddress(zeros, inv, raw); - printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddress(raw).data, KeyToString(keys.PublicKey).data, KeyToString(keys.PrivateKey).data); + if (unsigned zeros = getZeros(keys.PublicKey); zeros > atomicMax(&high, (unsigned)zeros)) { + printf("\nIPv6:\t%x\nPK:\t%s\nSK:\t%s\n", zeros, KeyToString(keys.PublicKey).data, KeyToString(keys.PrivateKey).data); } - __syncthreads(); + rmbytes(seed, sizeof(seed), xorshiftState); } - //randStates[thid] = localState; } int main() { - curandState* d_randStates; - cudaMalloc(&d_randStates, 1024 * sizeof(curandState)); - initRandStates << <4, 256 >> > (d_randStates); + const int threadsPerBlock = 256; + cudaDeviceProp prop; + cudaGetDeviceProperties_v2(&prop, 0); + int mBpSM; + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, minerKernel, threadsPerBlock, 0); + int SMs = prop.multiProcessorCount; + int maxBlocks = mBpSM * SMs; + const int totalThreads = maxBlocks * threadsPerBlock; + printf("SMs: %d\n", SMs); + printf("maxBlocks: %d\n", maxBlocks); + printf("totalThreads: %d\n", totalThreads); + printf("MaxBlocksPerSM: %d\n", mBpSM); + curandState* rst; + cudaMalloc(&rst, totalThreads * sizeof(curandState)); + initRand<<<100, threadsPerBlock >>>(rst); cudaDeviceSynchronize(); - minerKernel << <4, 256 >> > (d_randStates); + minerKernel<<<100, threadsPerBlock>>>(rst); cudaDeviceSynchronize(); - cudaFree(d_randStates); + cudaFree(rst); return 0; -} \ No newline at end of file +}