From ab6061f5bf2378c6e99d2a3936f05488b42293b3 Mon Sep 17 00:00:00 2001 From: rcxpony Date: Thu, 21 Aug 2025 13:05:05 +0500 Subject: [PATCH] owo --- README.md | 2 +- libs/defines.h | 3 -- libs/ed25519.cu | 4 +- libs/edsign.cu | 3 +- libs/f25519.cu | 112 +++++++++++++++++++++++++++++------------ libs/keymanip.cu | 6 ++- libs/keymanip.cuh | 3 +- libs/precomp_data.h | 3 +- libs/sha512.cu | 118 ++++++++++++++++++++++++++++++++++---------- libs/string.cu | 18 ++++--- meson.build | 37 ++++++++------ native.ini | 4 ++ sources/main.cpp | 21 ++++---- 13 files changed, 231 insertions(+), 103 deletions(-) create mode 100644 native.ini diff --git a/README.md b/README.md index 5fc52b4..104774a 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ git clone https://rcxpony.name/rcxpony/yggm.git && cd yggm meson setup build -Dbuildtype=release && cd build meson compile -./yggm -t 10 // for CPU +./yggmc -t 10 // for CPU ./yggmcu -t 10 // for GPU ``` # ToDo diff --git a/libs/defines.h b/libs/defines.h index cb25afa..22313f0 100644 --- a/libs/defines.h +++ b/libs/defines.h @@ -11,7 +11,4 @@ #define THDIVTHPB (tTh / THREADSPB) #define WHCOND true #endif -#ifndef USE_AVX2 - #define USE_AVX2 0 -#endif #endif \ No newline at end of file diff --git a/libs/ed25519.cu b/libs/ed25519.cu index 8e936d4..cef78a9 100644 --- a/libs/ed25519.cu +++ b/libs/ed25519.cu @@ -1,6 +1,6 @@ #include -#include #include +#include void __host__ __device__ ge_madd(ge_p1p1& __restrict__ r, const ge_p3& __restrict__ p, const precomp_data& __restrict__ q) { fe t0; fe_add(r.X, p.Y, p.X); @@ -14,7 +14,7 @@ void __host__ __device__ ge_madd(ge_p1p1& __restrict__ r, const ge_p3& __restric fe_add(r.Z, t0, r.T); fe_sub(r.T, t0, r.T); } -//r.XYZ = p.XYZ * p.TZT +// r.XYZ = p.XYZ * p.TZT void __host__ __device__ ge_p1p1_to_p2(ge_p2& __restrict__ r, const ge_p1p1& __restrict__ p) { fe_mul(r.X, p.X, p.T); fe_mul(r.Y, p.Y, p.Z); diff --git a/libs/edsign.cu b/libs/edsign.cu index 5d57422..aae99ed 100644 --- a/libs/edsign.cu +++ b/libs/edsign.cu @@ -1,7 +1,6 @@ +#include #include -#include #include -#include __device__ __forceinline__ void expand_key(unsigned char* __restrict__ expanded, const unsigned char* __restrict__ secret) { struct sha512_state s; memcpy(&s, &sha512_initial_state, sizeof(s)); diff --git a/libs/f25519.cu b/libs/f25519.cu index 6d28411..5c297c9 100644 --- a/libs/f25519.cu +++ b/libs/f25519.cu @@ -1,5 +1,5 @@ #include -//h = {0}; +// h = {0}; void __device__ __host__ fe_0(fe& __restrict__ h) { h[0] = 0; h[1] = 0; @@ -12,7 +12,7 @@ void __device__ __host__ fe_0(fe& __restrict__ h) { h[8] = 0; h[9] = 0; } -//h = {1,0,0,0,0,0,0,0,0,0,0}; +// h = {1,0,0,0,0,0,0,0,0,0,0}; void __device__ __host__ fe_1(fe& __restrict__ h) { h[0] = 1; h[1] = 0; @@ -25,7 +25,7 @@ void __device__ __host__ fe_1(fe& __restrict__ h) { h[8] = 0; h[9] = 0; } -//h = f + g +// h = f + g void __device__ __host__ fe_add(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) { long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9]; long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9]; @@ -57,7 +57,7 @@ void __device__ __host__ fe_cmov(fe& __restrict__ f, const fe& __restrict__ g, c f[8] = f8 ^ x8; f[9] = f9 ^ x9; } -//h[0..9] = f[0..9] +// h[0..9] = f[0..9] void __device__ __host__ fe_copy(fe& __restrict__ h, const fe& __restrict__ f) { long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9]; h[0] = f0; @@ -80,11 +80,11 @@ void __device__ __host__ fe_invert(fe& __restrict__ out, const fe& __restrict__ fe_mul(t0, t0, t1); fe_mul(t2, t0, t0); fe_mul(t1, t1, t2); - fe_mul(t2, t1,t1); + fe_mul(t2, t1, t1); + fe_mul(t2, t2, t2); fe_mul(t2, t2, t2); fe_mul(t2, t2, t2); fe_mul(t2, t2, t2); - fe_mul(t2, t2,t2); fe_mul(t1, t2, t1); fe_mul(t2, t1, t1); fe_mul(t2, t2, t2); @@ -159,7 +159,7 @@ int __device__ __host__ fe_isnegative(const fe& __restrict__ f) { fe_tobytes(s, f); return s[0] & 1; } -//h = f * g +// h = f * g void __device__ __host__ fe_mul(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) { long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9]; long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9]; @@ -183,18 +183,42 @@ void __device__ __host__ fe_mul(fe& __restrict__ h, const fe& __restrict__ f, co long h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19; long h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38; long h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0; - long carry = (h0 + 33554432L) >> 26L; h1 += carry; h0 -= carry << 26L; - carry = (h4 + 33554432L) >> 26L; h5 += carry; h4 -= carry << 26L; - carry = (h1 + 16777216L) >> 25L; h2 += carry; h1 -= carry << 25L; - carry = (h5 + 16777216L) >> 25L; h6 += carry; h5 -= carry << 25L; - carry = (h2 + 33554432L) >> 26L; h3 += carry; h2 -= carry << 26L; - carry = (h6 + 33554432L) >> 26L; h7 += carry; h6 -= carry << 26L; - carry = (h3 + 16777216L) >> 25L; h4 += carry; h3 -= carry << 25L; - carry = (h7 + 16777216L) >> 25L; h8 += carry; h7 -= carry << 25L; - carry = (h4 + 33554432L) >> 26L; h5 += carry; h4 -= carry << 26L; - carry = (h8 + 33554432L) >> 26L; h9 += carry; h8 -= carry << 26L; - carry = (h9 + 16777216L) >> 25L; h0 += carry * 19L; h9 -= carry << 25L; - carry = (h0 + 33554432L) >> 26L; h1 += carry; h0 -= carry << 26L; + long carry = (h0 + 33554432L) >> 26L; + h1 += carry; + h0 -= carry << 26L; + carry = (h4 + 33554432L) >> 26L; + h5 += carry; + h4 -= carry << 26L; + carry = (h1 + 16777216L) >> 25L; + h2 += carry; + h1 -= carry << 25L; + carry = (h5 + 16777216L) >> 25L; + h6 += carry; + h5 -= carry << 25L; + carry = (h2 + 33554432L) >> 26L; + h3 += carry; + h2 -= carry << 26L; + carry = (h6 + 33554432L) >> 26L; + h7 += carry; + h6 -= carry << 26L; + carry = (h3 + 16777216L) >> 25L; + h4 += carry; + h3 -= carry << 25L; + carry = (h7 + 16777216L) >> 25L; + h8 += carry; + h7 -= carry << 25L; + carry = (h4 + 33554432L) >> 26L; + h5 += carry; + h4 -= carry << 26L; + carry = (h8 + 33554432L) >> 26L; + h9 += carry; + h8 -= carry << 26L; + carry = (h9 + 16777216L) >> 25L; + h0 += carry * 19L; + h9 -= carry << 25L; + carry = (h0 + 33554432L) >> 26L; + h1 += carry; + h0 -= carry << 26L; h[0] = static_cast(h0); h[1] = static_cast(h1); h[2] = static_cast(h2); @@ -206,7 +230,7 @@ void __device__ __host__ fe_mul(fe& __restrict__ h, const fe& __restrict__ f, co h[8] = static_cast(h8); h[9] = static_cast(h9); } -//h = -f +// h = -f void __host__ __device__ fe_neg(fe& __restrict__ h, const fe& __restrict__ f) { long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9]; long h0 = -f0, h1 = -f1, h2 = -f2, h3 = -f3, h4 = -f4, h5 = -f5, h6 = -f6, h7 = -f7, h8 = -f8, h9 = -f9; @@ -245,18 +269,42 @@ void __host__ __device__ fe_mul2(fe& __restrict__ h, const fe& __restrict__ f) { long h7 = (f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38) << 1; long h8 = (f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38) << 1; long h9 = (f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2) << 1; - long carry = (h0 + 33554432L) >> 26L; h1 += carry; h0 -= carry << 26L; - carry = (h4 + 33554432L) >> 26L; h5 += carry; h4 -= carry << 26L; - carry = (h1 + 16777216L) >> 25L; h2 += carry; h1 -= carry << 25L; - carry = (h5 + 16777216L) >> 25L; h6 += carry; h5 -= carry << 25L; - carry = (h2 + 33554432L) >> 26L; h3 += carry; h2 -= carry << 26L; - carry = (h6 + 33554432L) >> 26L; h7 += carry; h6 -= carry << 26L; - carry = (h3 + 16777216L) >> 25L; h4 += carry; h3 -= carry << 25L; - carry = (h7 + 16777216L) >> 25L; h8 += carry; h7 -= carry << 25L; - carry = (h4 + 33554432L) >> 26L; h5 += carry; h4 -= carry << 26L; - carry = (h8 + 33554432L) >> 26L; h9 += carry; h8 -= carry << 26L; - carry = (h9 + 16777216L) >> 25L; h0 += carry * 19L; h9 -= carry << 25L; - carry = (h0 + 33554432L) >> 26L; h1 += carry; h0 -= carry << 26L; + long carry = (h0 + 33554432L) >> 26L; + h1 += carry; + h0 -= carry << 26L; + carry = (h4 + 33554432L) >> 26L; + h5 += carry; + h4 -= carry << 26L; + carry = (h1 + 16777216L) >> 25L; + h2 += carry; + h1 -= carry << 25L; + carry = (h5 + 16777216L) >> 25L; + h6 += carry; + h5 -= carry << 25L; + carry = (h2 + 33554432L) >> 26L; + h3 += carry; + h2 -= carry << 26L; + carry = (h6 + 33554432L) >> 26L; + h7 += carry; + h6 -= carry << 26L; + carry = (h3 + 16777216L) >> 25L; + h4 += carry; + h3 -= carry << 25L; + carry = (h7 + 16777216L) >> 25L; + h8 += carry; + h7 -= carry << 25L; + carry = (h4 + 33554432L) >> 26L; + h5 += carry; + h4 -= carry << 26L; + carry = (h8 + 33554432L) >> 26L; + h9 += carry; + h8 -= carry << 26L; + carry = (h9 + 16777216L) >> 25L; + h0 += carry * 19L; + h9 -= carry << 25L; + carry = (h0 + 33554432L) >> 26L; + h1 += carry; + h0 -= carry << 26L; h[0] = static_cast(h0); h[1] = static_cast(h1); h[2] = static_cast(h2); diff --git a/libs/keymanip.cu b/libs/keymanip.cu index da194c9..7ab2d9a 100644 --- a/libs/keymanip.cu +++ b/libs/keymanip.cu @@ -20,7 +20,9 @@ __device__ ds46 getAddr(const Addr16 rawAddr) noexcept { addrStr.data[pos++] = hexDigits[rawAddr[idx] & 0x0F]; addrStr.data[pos++] = hexDigits[rawAddr[idx + 1] >> 4]; addrStr.data[pos++] = hexDigits[rawAddr[idx + 1] & 0x0F]; - if (group < 7) { addrStr.data[pos++] = ':'; } + if (group < 7) { + addrStr.data[pos++] = ':'; + } } addrStr.data[pos] = '\0'; return addrStr; @@ -30,7 +32,7 @@ __device__ void getRawAddress(int lErase, Key32& InvertedPublicKey, Addr16& rawA const int bitsToShift = lErase & 7; const int start = lErase >> 3; if (bitsToShift) { - #pragma unroll +#pragma unroll for (int i = start; i < start + 15; i++) { InvertedPublicKey[i] = static_cast((InvertedPublicKey[i] << bitsToShift) | (InvertedPublicKey[i + 1] >> (8 - bitsToShift))); } diff --git a/libs/keymanip.cuh b/libs/keymanip.cuh index 12b7e28..a905e7a 100644 --- a/libs/keymanip.cuh +++ b/libs/keymanip.cuh @@ -10,7 +10,8 @@ using Addr16 = unsigned char[16]; using Key32 = unsigned char[32]; struct KeysBox32 { Key32 PublicKey, PrivateKey; -};__device__ ds64 ktos(const unsigned char* key) noexcept; +}; +__device__ ds64 ktos(const unsigned char* key) noexcept; __device__ ds46 getAddr(const Addr16 rawAddr) noexcept; __device__ void getRawAddress(int lErase, Key32& InvertedPublicKey, Addr16& rawAddr) noexcept; __device__ void invertKey(const unsigned char* key, unsigned char* inverted); diff --git a/libs/precomp_data.h b/libs/precomp_data.h index 29619f7..d609186 100644 --- a/libs/precomp_data.h +++ b/libs/precomp_data.h @@ -1,5 +1,6 @@ #include -__device__ __constant__ const precomp_data Bi[8] = { { +__device__ __constant__ const precomp_data Bi[8] = { + { { 25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626, -11754271, -6079156, 2047605 }, { -12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692, 5043384, 19500929, -15469378 }, { -8738181, 4489570, 9688441, -14785194, 10184609, -12363380, 29287919, 11864899, -24514362, -4438546 }, diff --git a/libs/sha512.cu b/libs/sha512.cu index 2ec01d7..46f7640 100644 --- a/libs/sha512.cu +++ b/libs/sha512.cu @@ -1,38 +1,103 @@ #include __device__ __constant__ sha512_state sha512_initial_state = { { - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL, } }; __device__ __constant__ unsigned long round_k[80] = { - 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, - 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, - 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, - 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, - 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, - 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, - 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, - 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, - 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, - 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, - 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, - 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, - 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, - 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, - 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, - 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, - 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, - 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, - 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, - 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, + 0x428a2f98d728ae22ULL, + 0x7137449123ef65cdULL, + 0xb5c0fbcfec4d3b2fULL, + 0xe9b5dba58189dbbcULL, + 0x3956c25bf348b538ULL, + 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, + 0xab1c5ed5da6d8118ULL, + 0xd807aa98a3030242ULL, + 0x12835b0145706fbeULL, + 0x243185be4ee4b28cULL, + 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, + 0x80deb1fe3b1696b1ULL, + 0x9bdc06a725c71235ULL, + 0xc19bf174cf692694ULL, + 0xe49b69c19ef14ad2ULL, + 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, + 0x240ca1cc77ac9c65ULL, + 0x2de92c6f592b0275ULL, + 0x4a7484aa6ea6e483ULL, + 0x5cb0a9dcbd41fbd4ULL, + 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, + 0xa831c66d2db43210ULL, + 0xb00327c898fb213fULL, + 0xbf597fc7beef0ee4ULL, + 0xc6e00bf33da88fc2ULL, + 0xd5a79147930aa725ULL, + 0x06ca6351e003826fULL, + 0x142929670a0e6e70ULL, + 0x27b70a8546d22ffcULL, + 0x2e1b21385c26c926ULL, + 0x4d2c6dfc5ac42aedULL, + 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, + 0x766a0abb3c77b2a8ULL, + 0x81c2c92e47edaee6ULL, + 0x92722c851482353bULL, + 0xa2bfe8a14cf10364ULL, + 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, + 0xc76c51a30654be30ULL, + 0xd192e819d6ef5218ULL, + 0xd69906245565a910ULL, + 0xf40e35855771202aULL, + 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, + 0x1e376c085141ab53ULL, + 0x2748774cdf8eeb99ULL, + 0x34b0bcb5e19b48a8ULL, + 0x391c0cb3c5c95a63ULL, + 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, + 0x682e6ff3d6b2b8a3ULL, + 0x748f82ee5defb2fcULL, + 0x78a5636f43172f60ULL, + 0x84c87814a1f0ab72ULL, + 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, + 0xa4506cebde82bde9ULL, + 0xbef9a3f7b2c67915ULL, + 0xc67178f2e372532bULL, + 0xca273eceea26619cULL, + 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, + 0xf57d4f7fee6ed178ULL, + 0x06f067aa72176fbaULL, + 0x0a637dc5a2c898a6ULL, + 0x113f9804bef90daeULL, + 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, + 0x32caab7b40c72493ULL, + 0x3c9ebe0a15c9bebcULL, + 0x431d67c49c100d4cULL, + 0x4cc5d4becb3e42b6ULL, + 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, + 0x6c44198c4a475817ULL, }; __device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) { return (static_cast(x[0]) << 56) | (static_cast(x[1]) << 48) | (static_cast(x[2]) << 40) | (static_cast(x[3]) << 32) | (static_cast(x[4]) << 24) | (static_cast(x[5]) << 16) | (static_cast(x[6]) << 8) | (static_cast(x[7])); } __device__ inline void store64(unsigned char* __restrict__ x, unsigned long v) { #pragma unroll 8 - for (unsigned char i = 0; i < 8; i++) x[i] = static_cast(v >> (56 - i * 8)); + for (unsigned char i = 0; i < 8; i++) + x[i] = static_cast(v >> (56 - i * 8)); } #define rot64(x, bits) ((x >> bits) | (x << (64 - bits))) __device__ void sha512_block(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk) { @@ -94,7 +159,8 @@ __device__ void sha512_final(sha512_state* __restrict__ s, const unsigned char* } __device__ void sha512_get(const sha512_state* __restrict__ s, unsigned char* __restrict__ hash) { unsigned len = 64; - if (len > 128) len = 128; + if (len > 128) + len = 128; unsigned i = 0, c = (len < 8) ? len : 8; store64(hash, s->h[i]); hash += c; diff --git a/libs/string.cu b/libs/string.cu index f49dda9..687b0de 100644 --- a/libs/string.cu +++ b/libs/string.cu @@ -1,18 +1,22 @@ __device__ int cstring_length(const char* s) { int len = 0; - while (s[len]) len++; + while (s[len]) + len++; return len; } __device__ int cstring_find(const char* s, const char* sub) { int i, j; int n = cstring_length(s); int m = cstring_length(sub); - if (m == 0) return 0; + if (m == 0) + return 0; for (i = 0; i <= n - m; i++) { for (j = 0; j < m; j++) { - if (s[i + j] != sub[j]) break; + if (s[i + j] != sub[j]) + break; } - if (j == m) return i; + if (j == m) + return i; } return -1; } @@ -22,7 +26,8 @@ __device__ int cstring_to_ull(const char* s, unsigned* val) { if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) { i = 2; } - if (s[i] == '\0') return 1; + if (s[i] == '\0') + return 1; for (; s[i]; i++) { char c = s[i]; int digit; @@ -60,7 +65,8 @@ __device__ void concat(const char* s1, const char* s2, char* out, int outSize) { } while (s2[j] && i < outSize - 1) { out[i] = s2[j]; - i++; j++; + i++; + j++; } out[i] = '\0'; } diff --git a/meson.build b/meson.build index 927555f..53264db 100644 --- a/meson.build +++ b/meson.build @@ -1,54 +1,59 @@ - -project('yggm', ['cpp', 'cuda'], +project( + 'yggm', + ['cpp', 'cuda'], version: '20.08.2025', - default_options: ['cpp_std=c++20'] + default_options: ['cpp_std=c++20'], ) +cpp = meson.get_compiler('cpp') +cuda = meson.get_compiler('cuda') + common_cpp_flags = [ '-march=native', + '-ffast-math', '-pipe', + '-funroll-loops', '-Wpedantic', '-Wconversion', '-Wuninitialized', '-Wsign-conversion', - '-masm=intel' + '-masm=intel', + '-I../libs/', ] common_cuda_flags = [ '-rdc=true', - '-Xptxas', '-O3', + '-Xptxas', + '-O3', '-use_fast_math', '-ftz=true', '-prec-div=false', '-prec-sqrt=false', '-gencode', 'arch=compute_75,code=sm_75', '-Wno-deprecated-gpu-targets', - '--default-stream', 'per-thread', '--expt-relaxed-constexpr', '-I../libs/', - '-I/opt/cuda/include', ] add_project_arguments(common_cpp_flags, language: 'cpp') add_project_arguments(common_cuda_flags, language: 'cuda') if get_option('buildtype') == 'release' - add_project_arguments(['-DRELEASE'], language: ['cpp', 'cuda']) + add_project_arguments(['-DRELEASE', '-DAVX2'], language: ['cpp', 'cuda']) elif get_option('buildtype') == 'debug' - add_project_arguments(['-DDEBUG', '-g', '-O0'], language: ['cpp', 'cuda']) + add_project_arguments(['-DDEBUG', '-g', '-O0', '-DAVX2'], language: ['cpp', 'cuda']) endif -cpp_sources = ['sources/main.cpp'] +cpp_sources = ['sources/main.cpp'] + files(run_command('find', 'libs', '-name','*.h').stdout().split()) cuda_sources = ['sources/main.cu'] + files(run_command('find', 'libs', '-name', '*.cu').stdout().split()) -executable('yggmcu', +executable( + 'yggmcu', sources: cuda_sources, - install: true, - dependencies: dependency('cuda', modules: ['cudart'], required: false) ) -executable('yggm', +executable( + 'yggmc', sources: cpp_sources, - install: true, - dependencies: dependency('libsodium') + dependencies: dependency('libsodium'), ) \ No newline at end of file diff --git a/native.ini b/native.ini new file mode 100644 index 0000000..b20af77 --- /dev/null +++ b/native.ini @@ -0,0 +1,4 @@ +[binaries] +cpp = 'clang++' +cuda = 'nvcc' +cuda_host_compiler = 'gcc-14' \ No newline at end of file diff --git a/sources/main.cpp b/sources/main.cpp index a67301f..611b865 100644 --- a/sources/main.cpp +++ b/sources/main.cpp @@ -1,6 +1,6 @@ +#include "defines.h" #include #include -#include "../libs/defines.h" #include #include #include @@ -80,7 +80,7 @@ void getRawAddress(unsigned lErase, Key& InvertedPublicKey, Address& rawAddr) { memcpy(&rawAddr[2], &InvertedPublicKey[start], 14); } inline void invertKey(const unsigned char* __restrict key, Key& inverted) { -#if USE_AVX2 +#ifdef __AVX2__ _mm256_storeu_si256(reinterpret_cast<__m256i*>(inverted), _mm256_xor_si256(_mm256_loadu_si256(reinterpret_cast(key)), _mm256_set1_epi8(0xFF))); #else for (int i = 0; i < 32; ++i) { @@ -102,17 +102,16 @@ inline unsigned getZeros(const Key& v) { return leadZeros; } void miner_thread() { - alignas(32) Key inv; - KeysBox keys; - Address rawAddr; + alignas(32) thread_local Key inv; + thread_local KeysBox keys; + thread_local Address rawAddr; while (true) { crypto_sign_ed25519_keypair(keys.PublicKey, keys.PrivateKey); - unsigned ones = getZeros(keys.PublicKey); - unsigned current_high = conf.high.load(std::memory_order_relaxed); - while (ones > current_high && !conf.high.compare_exchange_strong(current_high, ones, std::memory_order_relaxed)) { - current_high = conf.high.load(std::memory_order_relaxed); + unsigned ones = getZeros(keys.PublicKey), high = conf.high.load(std::memory_order_relaxed); + while (ones > high && !conf.high.compare_exchange_strong(high, ones, std::memory_order_relaxed)) { + high = conf.high.load(std::memory_order_relaxed); } - if (ones > current_high) { + if (ones > high) { invertKey(keys.PublicKey, inv); getRawAddress(ones, inv, rawAddr); printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddress(rawAddr).c_str(), KeyToString(keys.PublicKey).c_str(), KeyToString(keys.PrivateKey).c_str()); @@ -129,7 +128,7 @@ void startThreads() { } } int main(int argc, char* argv[]) { - printf("BuildType: %s\nAVX2: %d\n", __BUILDTYPE__, USE_AVX2); + std::cout << "BuildType: " << __BUILDTYPE__ << std::endl << "AVX2: " << __AVX2__ << std::endl; if (argc < 1) return 0; for (int x = 0; x < argc; x++) {