diff --git a/build/Makefile b/build/Makefile index e425dc0..85f8605 100644 --- a/build/Makefile +++ b/build/Makefile @@ -1,11 +1,18 @@ NVCC := nvcc -NVCC_FLAGS := -rdc=true -O3 -Xptxas -O3 \ - -gencode arch=compute_75,code=sm_75 \ - --default-stream per-thread \ - -Wno-deprecated-gpu-targets \ - --expt-relaxed-constexpr \ - -I../libs/ \ - -std=c++20 +NVCC_FLAGS := -rdc=true -O3 -Xptxas -O3 \ +-use_fast_math -ftz=true -prec-div=false -prec-sqrt=false \ +-gencode arch=compute_75,code=sm_75 \ +--default-stream per-thread \ +-Wno-deprecated-gpu-targets \ +--expt-relaxed-constexpr \ +-I../libs/ \ +-std=c++17 +BUILD ?= RELEASE +ifeq ($(BUILD),DEBUG) + BUILD_DEFINES := -DDEBUG +else + BUILD_DEFINES := -DRELEASE +endif MAIN_SOURCE := ../sources/main.cu LIBS_DIR := ../libs/ BUILD_DIR := ../build @@ -21,4 +28,4 @@ $(TARGET): $(MAIN_SOURCE) $(LIBS_OBJECTS) $(BUILD_DIR)/%.o: $(LIBS_DIR)%.cu | $(BUILD_DIR) $(NVCC) $(NVCC_FLAGS) -c $< -o $@ clean: - @rm -f $(BUILD_DIR)/*.o $(TARGET) + @rm -f $(BUILD_DIR)/*.o $(TARGET) \ No newline at end of file diff --git a/libs/ed25519.cu b/libs/ed25519.cu index 316d845..bb82bfe 100644 --- a/libs/ed25519.cu +++ b/libs/ed25519.cu @@ -34,7 +34,7 @@ __device__ void ed25519_unproject(unsigned char* x, unsigned char* y, const stru f25519_normalize(x); f25519_normalize(y); } -__device__ void ed25519_pack(unsigned char* c, const unsigned char* x, const unsigned char* y) { +__device__ void ed25519_pack(unsigned char* __restrict__ c, const unsigned char* x, const unsigned char* y) { unsigned char tmp[F25519_SIZE]; unsigned char parity; f25519_copy(tmp, x); diff --git a/libs/edsign.cu b/libs/edsign.cu index 477998d..beaaeb1 100644 --- a/libs/edsign.cu +++ b/libs/edsign.cu @@ -23,10 +23,17 @@ __device__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secre expand_key(expanded, secret); sm_pack(pub, expanded); } -__device__ void compact_wipe(void* data, unsigned long length) { +__device__ void compact_wipe(void* __restrict__ data, unsigned long length) { volatile unsigned char* p = (volatile unsigned char*)data; - while (length--) { - *p++ = 0; + unsigned long i = 0; + for (; i + 3 < length; i += 4) { + p[i] = 0; + p[i + 1] = 0; + p[i + 2] = 0; + p[i + 3] = 0; + } + for (; i < length; i++) { + p[i] = 0; } } __device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]) { diff --git a/libs/edsign.cuh b/libs/edsign.cuh index d83e544..629a9a6 100644 --- a/libs/edsign.cuh +++ b/libs/edsign.cuh @@ -1,10 +1,9 @@ #ifndef __EDSIGN_CUH #define __EDSIGN_CUH -#define F25519_SIZE 32 __device__ void expand_key(unsigned char* expanded, const unsigned char* secret); __device__ void pp(unsigned char* packed, const struct ed25519_pt* p); __device__ void sm_pack(unsigned char* r, const unsigned char* k); __device__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret); -__device__ void compact_wipe(void* data, unsigned long length); +__device__ void compact_wipe(void* __restrict__ data, unsigned long length); __device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]); #endif \ No newline at end of file diff --git a/sources/main.cu b/sources/main.cu index 6ebdb2e..b0db900 100644 --- a/sources/main.cu +++ b/sources/main.cu @@ -69,29 +69,18 @@ __global__ void initRand(curandState* rs) { curand(&rs[id]); } } -__device__ __forceinline__ unsigned long long xorshift128plus(unsigned long long* state) noexcept { - unsigned long long x = state[0]; - const unsigned long long y = state[1]; - state[0] = y; - x ^= x << 23; - x ^= x >> 17; - x ^= y ^ (y >> 26); - state[1] = x; - return x + y; -} -__device__ __forceinline__ void rmbytes(unsigned char* buf, unsigned long long* state) noexcept { +__device__ __forceinline__ void rmbytes(unsigned char* buf, curandState* state) { #pragma unroll 32 - for (unsigned long i = 0; i < 32; i++) buf[i] = static_cast(xorshift128plus(state) & 0xFF); + for (unsigned long i = 0; i < 32; i++) { + buf[i] = curand(state) & 0xFF; + } } __global__ void KeyGen(curandState* randStates) { curandState localState = randStates[blockIdx.x * blockDim.x + threadIdx.x]; - unsigned long long xorshiftState[2]; - xorshiftState[0] = curand(&localState); - xorshiftState[1] = curand(&localState); - Key32 seed; - KeysBox32 keys; while (true) { - rmbytes(seed, xorshiftState); + KeysBox32 keys; + Key32 seed; + rmbytes(seed, &localState); ed25519_keygen(keys.PrivateKey, keys.PublicKey, seed); if (unsigned zeros = getZeros(keys.PublicKey); zeros > atomicMax((unsigned*)&d_high, zeros)) { Address raw;