From cfecea84ec354d3e8015f54000e1c2bbe53fd76d Mon Sep 17 00:00:00 2001
From: rcxpony <rcxpony@rcxpony.name>
Date: Sat, 15 Mar 2025 15:54:52 +0500
Subject: [PATCH] Align ed25519 scratch buffers and KeysBox32; rework invertKey and sha512_get tail

---
 libs/ed25519.cu   | 8 ++++----
 libs/keymanip.cu  | 6 ++++--
 libs/keymanip.cuh | 2 +-
 libs/sha512.cu    | 8 +++-----
 sources/main.cu   | 3 ++-
 5 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/libs/ed25519.cu b/libs/ed25519.cu
index 4242d3c..8dd44bb 100644
--- a/libs/ed25519.cu
+++ b/libs/ed25519.cu
@@ -18,7 +18,7 @@ __device__ __constant__ unsigned char ed25519_k[32] = {
     0x30,0xd1,0xf3,0xee,0xf2,0x80,0x8e,0x19,0xe7,0xfc,0xdf,0x56,0xdc,0xd9,0x06,0x24
 };
 __device__ void ed25519_unproject(unsigned char* __restrict__ x, unsigned char* __restrict__ y, const struct ed25519_pt* __restrict__ p) {
-    unsigned char z1[32];
+    unsigned char __align__(32) z1[32];
     f25519_inv__distinct(z1, p->z);
     f25519_mul__distinct(x, p->x, z1);
     f25519_mul__distinct(y, p->y, z1);
@@ -26,7 +26,7 @@ __device__ void ed25519_unproject(unsigned char* __restrict__ x, unsigned char*
     f25519_normalize(y);
 }
 __device__ void ed25519_pack(unsigned char* __restrict__ c, const unsigned char* __restrict__ x, const unsigned char* __restrict__ y) {
-    unsigned char tmp[32];
+    unsigned char __align__(32) tmp[32];
     unsigned char parity;
     f25519_copy(tmp, x);
     f25519_normalize(tmp);
@@ -36,7 +36,7 @@ __device__ void ed25519_pack(unsigned char* __restrict__ c, const unsigned char*
     c[31] |= parity;
 }
 __device__ __forceinline__ void ed25519_add(struct ed25519_pt* __restrict__ r, const struct ed25519_pt* __restrict__ p1, const struct ed25519_pt* __restrict__ p2) {
-    unsigned char a[32], b[32], c[32], d[32], e[32], f[32], g[32], h[32];
+    unsigned char __align__(32) a[32], __align__(32) b[32], __align__(32) c[32], __align__(32) d[32], __align__(32) e[32], __align__(32) f[32], __align__(32) g[32], __align__(32) h[32];
     f25519_sub(c, p1->y, p1->x);
     f25519_sub(d, p2->y, p2->x);
     f25519_mul__distinct(a, c, d);
@@ -57,7 +57,7 @@ __device__ __forceinline__ void ed25519_add(struct ed25519_pt* __restrict__ r, c
     f25519_mul__distinct(r->z, f, g);
 }
 __device__ __forceinline__ void ed25519_double(struct ed25519_pt* __restrict__ r, const struct ed25519_pt* __restrict__ p) {
-    unsigned char a[32], b[32], c[32], e[32], f[32], g[32], h[32];
+    unsigned char __align__(32) a[32], __align__(32) b[32], __align__(32) c[32], __align__(32) e[32], __align__(32) f[32], __align__(32) g[32], __align__(32) h[32];
     f25519_mul__distinct(a, p->x, p->x);
     f25519_mul__distinct(b, p->y, p->y);
     f25519_mul__distinct(c, p->z, p->z);
diff --git a/libs/keymanip.cu b/libs/keymanip.cu
index e8fef94..6d2984d 100644
--- a/libs/keymanip.cu
+++ b/libs/keymanip.cu
@@ -41,6 +41,8 @@ __device__ void getRawAddress(int lErase, Key32& InvertedPublicKey, Addr16& rawA
     memcpy(&rawAddr[2], &InvertedPublicKey[start], 14);
 }
 __device__ void invertKey(const unsigned char* key, unsigned char* inverted) {
-#pragma unroll 32
-    for (unsigned char i = 0; i < 32; i++) inverted[i] = key[i] ^ 0xFF;
+    // Writes the bitwise complement of key[0..31] into inverted[0..31].
+    // Byte-wise on purpose: nothing here guarantees 4-byte alignment, so uchar4 casts could fault.
+#pragma unroll
+    for (int i = 0; i < 32; ++i) inverted[i] = static_cast<unsigned char>(key[i] ^ 0xFF);
 }
\ No newline at end of file
diff --git a/libs/keymanip.cuh b/libs/keymanip.cuh
index 907eb07..8b1b19a 100644
--- a/libs/keymanip.cuh
+++ b/libs/keymanip.cuh
@@ -8,7 +8,7 @@ struct ds46 {
 };
 using Addr16 = unsigned char[16];
 using Key32 = unsigned char[32];
-struct KeysBox32 {
+struct __align__(32) KeysBox32 { // 32-byte aligned: PublicKey at offset 0, PrivateKey at offset 32
     Key32 PublicKey;
     Key32 PrivateKey;
 };
diff --git a/libs/sha512.cu b/libs/sha512.cu
index b9330be..0d0ae26 100644
--- a/libs/sha512.cu
+++ b/libs/sha512.cu
@@ -109,9 +109,7 @@ __device__ void sha512_get(const sha512_state* s, unsigned char* hash) {
         len -= 8;
         i++;
     }
-    if (len > 0) {
-        unsigned char tmp[8];
-        store64(tmp, s->h[i]);
-        memcpy(hash, tmp, len);
-    }
+    unsigned char tmp[8];
+    store64(tmp, s->h[i]);
+    memcpy(hash, tmp, len);
 }
\ No newline at end of file
diff --git a/sources/main.cu b/sources/main.cu
index 9a666aa..6dedef8 100644
--- a/sources/main.cu
+++ b/sources/main.cu
@@ -84,9 +84,10 @@ __device__ __forceinline__ void rmbytes(unsigned char* buf, curandState* state)
 __global__ void KeyGen(curandState* randStates) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     curandState localState = randStates[idx];
+    //printf("Seed: %s\n", ktos(seed).data);
     while (true) {
-        KeysBox32 keys;
         Key32 seed;
+        KeysBox32 keys;
         rmbytes(seed, &localState);
         ed25519_keygen(keys.PrivateKey, keys.PublicKey, seed);
         if (unsigned zeros = getZeros(keys.PublicKey); zeros > atomicMax((unsigned*)&d_high, zeros)) {