test

2025-03-13 04:09:27 +05:00 · 2025-03-13 04:09:27 +05:00 · b95318103a
commit b95318103a
parent 4c8e35e0f8
8 changed files with 1021 additions and 32 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,3 @@
 build/
 .vscode/
-libs/
-*.o
-*.cu
+*.o
--- a/libs/ed25519.cuh
+++ b/libs/ed25519.cuh
@ -0,0 +1,139 @@
+#ifndef __ED25519_CUH
+#define __ED25519_CUH
+#include <stdint.h>
+#include <f25519.cuh>
+#define F25519_SIZE 32
+struct ed25519_pt {
+    uint8_t x[F25519_SIZE], y[F25519_SIZE], t[F25519_SIZE], z[F25519_SIZE];
+};
+__device__ __constant__ struct ed25519_pt ed25519_base = {
+    {0x1a,0xd5,0x25,0x8f,0x60,0x2d,0x56,0xc9,0xb2,0xa7,0x25,0x95,0x60,0xc7,0x2c,0x69,
+     0x5c,0xdc,0xd6,0xfd,0x31,0xe2,0xa4,0xc0,0xfe,0x53,0x6e,0xcd,0xd3,0x36,0x69,0x21},
+    {0x58,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,
+     0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66},
+    {0xa3,0xdd,0xb7,0xa5,0xb3,0x8a,0xde,0x6d,0xf5,0x52,0x51,0x77,0x80,0x9f,0xf0,0x20,
+     0x7d,0xe3,0xab,0x64,0x8e,0x4e,0xea,0x66,0x65,0x76,0x8b,0xd7,0x0f,0x5f,0x87,0x67},
+    {1,0}
+};
+__device__ __constant__ struct ed25519_pt ed25519_neutral = {
+    {0}, {1,0}, {0}, {1,0}
+};
+__device__ __constant__ uint8_t ed25519_d[F25519_SIZE] = {
+    0xa3,0x78,0x59,0x13,0xca,0x4d,0xeb,0x75,0xab,0xd8,0x41,0x41,0x4d,0x0a,0x70,0x00,
+    0x98,0xe8,0x79,0x77,0x79,0x40,0xc7,0x8c,0x73,0xfe,0x6f,0x2b,0xee,0x6c,0x03,0x52
+};
+__device__ __constant__ uint8_t ed25519_k[F25519_SIZE] = {
+    0x59,0xf1,0xb2,0x26,0x94,0x9b,0xd6,0xeb,0x56,0xb1,0x83,0x82,0x9a,0x14,0xe0,0x00,
+    0x30,0xd1,0xf3,0xee,0xf2,0x80,0x8e,0x19,0xe7,0xfc,0xdf,0x56,0xdc,0xd9,0x06,0x24
+};
+__device__ __forceinline__ void ed25519_project(struct ed25519_pt* p, const uint8_t* x, const uint8_t* y) {
+    f25519_copy(p->x, x);
+    f25519_copy(p->y, y);
+    f25519_load(p->z, 1);
+    f25519_mul__distinct(p->t, x, y);
+}
+__device__ __forceinline__ void ed25519_unproject(uint8_t* x, uint8_t* y, const struct ed25519_pt* p) {
+    uint8_t z1[F25519_SIZE];
+    f25519_inv__distinct(z1, p->z);
+    f25519_mul__distinct(x, p->x, z1);
+    f25519_mul__distinct(y, p->y, z1);
+    f25519_normalize(x);
+    f25519_normalize(y);
+}
+__device__ __forceinline__ void ed25519_pack(uint8_t* c, const uint8_t* x, const uint8_t* y) {
+    uint8_t tmp[F25519_SIZE];
+    uint8_t parity;
+    f25519_copy(tmp, x);
+    f25519_normalize(tmp);
+    parity = (tmp[0] & 1) << 7;
+    f25519_copy(c, y);
+    f25519_normalize(c);
+    c[31] |= parity;
+}
+__device__ __forceinline__ uint8_t ed25519_try_unpack(uint8_t* x, uint8_t* y, const uint8_t* comp) {
+    int parity = comp[31] >> 7;
+    uint8_t a[F25519_SIZE], b[F25519_SIZE], c_[F25519_SIZE];
+    f25519_copy(y, comp);
+    y[31] &= 127;
+    f25519_mul__distinct(c_, y, y);
+    f25519_mul__distinct(b, c_, ed25519_d);
+    f25519_add(a, b, f25519_one);
+    f25519_inv__distinct(b, a);
+    f25519_sub(a, c_, f25519_one);
+    f25519_mul__distinct(c_, a, b);
+    f25519_sqrt(a, c_);
+    f25519_neg(b, a);
+    f25519_select(x, a, b, (a[0] ^ parity) & 1);
+    f25519_mul__distinct(a, x, x);
+    f25519_normalize(a);
+    f25519_normalize(c_);
+    return f25519_eq(a, c_);
+}
+__device__ __forceinline__ void ed25519_add(struct ed25519_pt* r, const struct ed25519_pt* p1, const struct ed25519_pt* p2) {
+    uint8_t a[F25519_SIZE], b[F25519_SIZE], c[F25519_SIZE], d[F25519_SIZE];
+    uint8_t e[F25519_SIZE], f[F25519_SIZE], g[F25519_SIZE], h[F25519_SIZE];
+    f25519_sub(c, p1->y, p1->x);
+    f25519_sub(d, p2->y, p2->x);
+    f25519_mul__distinct(a, c, d);
+    f25519_add(c, p1->y, p1->x);
+    f25519_add(d, p2->y, p2->x);
+    f25519_mul__distinct(b, c, d);
+    f25519_mul__distinct(d, p1->t, p2->t);
+    f25519_mul__distinct(c, d, ed25519_k);
+    f25519_mul__distinct(d, p1->z, p2->z);
+    f25519_add(d, d, d);
+    f25519_sub(e, b, a);
+    f25519_sub(f, d, c);
+    f25519_add(g, d, c);
+    f25519_add(h, b, a);
+    f25519_mul__distinct(r->x, e, f);
+    f25519_mul__distinct(r->y, g, h);
+    f25519_mul__distinct(r->t, e, h);
+    f25519_mul__distinct(r->z, f, g);
+}
+__device__ __forceinline__ void ed25519_double(struct ed25519_pt* r, const struct ed25519_pt* p) {
+    uint8_t a[F25519_SIZE], b[F25519_SIZE], c[F25519_SIZE];
+    uint8_t e[F25519_SIZE], f[F25519_SIZE], g[F25519_SIZE], h[F25519_SIZE];
+    f25519_mul__distinct(a, p->x, p->x);
+    f25519_mul__distinct(b, p->y, p->y);
+    f25519_mul__distinct(c, p->z, p->z);
+    f25519_add(c, c, c);
+    f25519_add(f, p->x, p->y);
+    f25519_mul__distinct(e, f, f);
+    f25519_sub(e, e, a);
+    f25519_sub(e, e, b);
+    f25519_sub(g, b, a);
+    f25519_sub(f, g, c);
+    f25519_neg(h, b);
+    f25519_sub(h, h, a);
+    f25519_mul__distinct(r->x, e, f);
+    f25519_mul__distinct(r->y, g, h);
+    f25519_mul__distinct(r->t, e, h);
+    f25519_mul__distinct(r->z, f, g);
+}
+__device__ __forceinline__ void ed25519_copy(struct ed25519_pt* dst, const struct ed25519_pt* src) {
+    f25519_copy(dst->x, src->x);
+    f25519_copy(dst->y, src->y);
+    f25519_copy(dst->t, src->t);
+    f25519_copy(dst->z, src->z);
+}
+__device__ __forceinline__ void ed25519_smult(struct ed25519_pt* r_out, const struct ed25519_pt* p, const uint8_t* e) {
+    struct ed25519_pt r = ed25519_neutral;
+    for (int i = 255; i >= 0; i--) {
+        struct ed25519_pt s;
+        ed25519_double(&r, &r);
+        ed25519_add(&s, &r, p);
+        uint8_t bit = (e[i >> 3] >> (i & 7)) & 1;
+        f25519_select(r.x, r.x, s.x, bit);
+        f25519_select(r.y, r.y, s.y, bit);
+        f25519_select(r.z, r.z, s.z, bit);
+        f25519_select(r.t, r.t, s.t, bit);
+    }
+    ed25519_copy(r_out, &r);
+}
+__device__ __forceinline__ void ed25519_prepare(uint8_t* e) {
+    e[0] &= 0xf8;
+    e[31] &= 0x7f;
+    e[31] |= 0x40;
+}
+#endif
--- a/libs/edsign.cuh
+++ b/libs/edsign.cuh
@ -0,0 +1,120 @@
+#ifndef __EDSIGN_CUH
+#define __EDSIGN_CUH
+#include <ed25519.cuh>
+#ifndef COMPACT_DISABLE_ED25519
+#include <sha512.cuh>
+#include <fprime.cuh>
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstring>
+#define EXPANDED_SIZE         64
+#define EDSIGN_SECRET_KEY_SIZE 32
+#define EDSIGN_PUBLIC_KEY_SIZE 32
+#define EDSIGN_SIGNATURE_SIZE  64
+#define SHA512_HASH_SIZE       64
+__device__ __constant__ uint8_t ed25519_order[FPRIME_SIZE] = {
+    0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
+    0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10
+};
+__device__ __forceinline__ void expand_key(uint8_t* expanded, const uint8_t* secret) {
+    struct sha512_state s;
+    sha512_init(&s);
+    sha512_final(&s, secret, EDSIGN_SECRET_KEY_SIZE);
+    sha512_get(&s, expanded, 0, EXPANDED_SIZE);
+    ed25519_prepare(expanded);
+}
+__device__ __forceinline__ uint8_t upp(struct ed25519_pt* p, const uint8_t* packed) {
+    uint8_t x[F25519_SIZE], y[F25519_SIZE];
+    uint8_t ok = ed25519_try_unpack(x, y, packed);
+    ed25519_project(p, x, y);
+    return ok;
+}
+__device__ __forceinline__ void pp(uint8_t* packed, const struct ed25519_pt* p) {
+    uint8_t x[F25519_SIZE], y[F25519_SIZE];
+    ed25519_unproject(x, y, p);
+    ed25519_pack(packed, x, y);
+}
+__device__ __forceinline__ void sm_pack(uint8_t* r, const uint8_t* k) {
+    struct ed25519_pt p;
+    ed25519_smult(&p, &ed25519_base, k);
+    pp(r, &p);
+}
+__device__ __forceinline__ void edsign_sec_to_pub(uint8_t* pub, const uint8_t* secret) {
+    uint8_t expanded[EXPANDED_SIZE];
+    expand_key(expanded, secret);
+    sm_pack(pub, expanded);
+}
+__device__ __forceinline__ void hash_with_prefix(uint8_t* out_fp, uint8_t* init_block, unsigned int prefix_size, const uint8_t* message, size_t len) {
+    struct sha512_state s;
+    sha512_init(&s);
+    if (len < SHA512_BLOCK_SIZE && len + prefix_size < SHA512_BLOCK_SIZE) {
+        memcpy(init_block + prefix_size, message, len);
+        sha512_final(&s, init_block, len + prefix_size);
+    } else {
+        size_t i;
+        memcpy(init_block + prefix_size, message, SHA512_BLOCK_SIZE - prefix_size);
+        sha512_block(&s, init_block);
+        for (i = SHA512_BLOCK_SIZE - prefix_size; i + SHA512_BLOCK_SIZE <= len; i += SHA512_BLOCK_SIZE) {
+            sha512_block(&s, message + i);
+        }
+        sha512_final(&s, message + i, len - i + prefix_size);
+    }
+    sha512_get(&s, init_block, 0, SHA512_HASH_SIZE);
+    fprime_from_bytes(out_fp, init_block, SHA512_HASH_SIZE, ed25519_order);
+}
+__device__ __forceinline__ void generate_k(uint8_t* k, const uint8_t* kgen_key, const uint8_t* message, size_t len) {
+    uint8_t block[SHA512_BLOCK_SIZE];
+    memcpy(block, kgen_key, 32);
+    hash_with_prefix(k, block, 32, message, len);
+}
+__device__ __forceinline__ void hash_message(uint8_t* z, const uint8_t* r, const uint8_t* a, const uint8_t* m, size_t len) {
+    uint8_t block[SHA512_BLOCK_SIZE];
+    memcpy(block, r, 32);
+    memcpy(block + 32, a, 32);
+    hash_with_prefix(z, block, 64, m, len);
+}
+__device__ void edsign_sign(uint8_t* signature, const uint8_t* pub, const uint8_t* secret, const uint8_t* message, size_t len) {
+    uint8_t expanded[EXPANDED_SIZE];
+    uint8_t e[FPRIME_SIZE], s[FPRIME_SIZE], k[FPRIME_SIZE], z[FPRIME_SIZE];
+    expand_key(expanded, secret);
+    generate_k(k, expanded + 32, message, len);
+    sm_pack(signature, k);
+    hash_message(z, signature, pub, message, len);
+    fprime_from_bytes(e, expanded, 32, ed25519_order);
+    fprime_mul(s, z, e, ed25519_order);
+    fprime_add(s, k, ed25519_order);
+    memcpy(signature + 32, s, 32);
+}
+__device__ uint8_t edsign_verify(const uint8_t* signature, const uint8_t* pub, const uint8_t* message, size_t len) {
+    struct ed25519_pt p, q;
+    uint8_t lhs[F25519_SIZE], rhs[F25519_SIZE], z[FPRIME_SIZE];
+    uint8_t ok = 1;
+    hash_message(z, signature, pub, message, len);
+    sm_pack(lhs, signature + 32);
+    ok &= upp(&p, pub);
+    ed25519_smult(&p, &p, z);
+    ok &= upp(&q, signature);
+    ed25519_add(&p, &p, &q);
+    pp(rhs, &p);
+    return ok & f25519_eq(lhs, rhs);
+}
+__global__ void sign_kernel(uint8_t* d_signatures, const uint8_t* d_pubs, const uint8_t* d_secrets, const uint8_t* d_messages, const size_t* d_message_lens, int num_messages) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= num_messages) return;
+    uint8_t* signature = d_signatures + idx * EDSIGN_SIGNATURE_SIZE;
+    const uint8_t* pub = d_pubs + idx * EDSIGN_PUBLIC_KEY_SIZE;
+    const uint8_t* secret = d_secrets + idx * EDSIGN_SECRET_KEY_SIZE;
+    const uint8_t* message = d_messages;
+    size_t len = d_message_lens[idx];
+    edsign_sign(signature, pub, secret, message, len);
+}
+void launch_sign_kernel(uint8_t* d_signatures, const uint8_t* d_pubs, const uint8_t* d_secrets, const uint8_t* d_messages, const size_t* d_message_lens, int num_messages) {
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (num_messages + threadsPerBlock - 1) / threadsPerBlock;
+    sign_kernel << <blocksPerGrid, threadsPerBlock >> > (d_signatures, d_pubs, d_secrets, d_messages, d_message_lens, num_messages);
+    cudaDeviceSynchronize();
+}
+#endif
+#endif
--- a/libs/f25519.cuh
+++ b/libs/f25519.cuh
@ -0,0 +1,211 @@
+#ifndef __F25519_CUH
+#define __F25519_CUH
+#include <stdint.h>
+#define F25519_SIZE 32
+__device__ __constant__ uint8_t f25519_zero[F25519_SIZE] = { 0 };
+__device__ __constant__ uint8_t f25519_one[F25519_SIZE] = { 1 };
+__device__ __forceinline__ void f25519_load(uint8_t* __restrict__ x, uint32_t c) {
+#pragma unroll
+    for (unsigned int i = 0; i < sizeof(c); i++) {
+        x[i] = c & 0xFF;
+        c >>= 8;
+    }
+#pragma unroll
+    for (unsigned int i = sizeof(c); i < F25519_SIZE; i++) {
+        x[i] = 0;
+    }
+}
+__device__ __forceinline__ void f25519_copy(uint8_t* __restrict__ x, const uint8_t* __restrict__ a) {
+#pragma unroll
+    for (int i = 0; i < F25519_SIZE; i++) {
+        x[i] = a[i];
+    }
+}
+__device__ __forceinline__ void f25519_select(uint8_t* __restrict__ dst,
+    const uint8_t* __restrict__ zero,
+    const uint8_t* __restrict__ one, uint8_t cond) {
+    const uint8_t mask = 0 - cond;
+#pragma unroll
+    for (int i = 0; i < F25519_SIZE; i++) {
+        dst[i] = zero[i] ^ (mask & (one[i] ^ zero[i]));
+    }
+}
+__device__ __forceinline__ void f25519_normalize(uint8_t* __restrict__ x) {
+    uint8_t minusp[F25519_SIZE];
+    uint16_t c = (x[31] >> 7) * 19;
+    x[31] &= 127;
+#pragma unroll
+    for (int i = 0; i < F25519_SIZE; i++) {
+        c += x[i];
+        x[i] = (uint8_t)c;
+        c >>= 8;
+    }
+    c = 19;
+#pragma unroll
+    for (int i = 0; i + 1 < F25519_SIZE; i++) {
+        c += x[i];
+        minusp[i] = (uint8_t)c;
+        c >>= 8;
+    }
+    c += x[F25519_SIZE - 1] - 128;
+    minusp[F25519_SIZE - 1] = (uint8_t)c;
+    f25519_select(x, minusp, x, (c >> 15) & 1);
+}
+__device__ __forceinline__ uint8_t f25519_eq(const uint8_t* __restrict__ x, const uint8_t* __restrict__ y) {
+    uint8_t s = 0;
+#pragma unroll
+    for (int i = 0; i < F25519_SIZE; i++)
+        s |= x[i] ^ y[i];
+    s |= s >> 4;
+    s |= s >> 2;
+    s |= s >> 1;
+    return (s ^ 1) & 1;
+}
+__device__ __forceinline__ void f25519_add(uint8_t* __restrict__ r,
+    const uint8_t* __restrict__ a,
+    const uint8_t* __restrict__ b) {
+    uint16_t c = 0;
+#pragma unroll
+    for (int i = 0; i < F25519_SIZE; i++) {
+        c = (c >> 8) + ((uint16_t)a[i]) + ((uint16_t)b[i]);
+        r[i] = (uint8_t)c;
+    }
+    r[F25519_SIZE - 1] &= 127;
+    c = (c >> 7) * 19;
+#pragma unroll
+    for (int i = 0; i < F25519_SIZE; i++) {
+        c += r[i];
+        r[i] = (uint8_t)c;
+        c >>= 8;
+    }
+}
+__device__ __forceinline__ void f25519_sub(uint8_t* __restrict__ r,
+    const uint8_t* __restrict__ a,
+    const uint8_t* __restrict__ b) {
+    uint32_t c = 218;
+    int i = 0;
+#pragma unroll
+    for (i = 0; i + 1 < F25519_SIZE; i++) {
+        c += 65280 + ((uint32_t)a[i]) - ((uint32_t)b[i]);
+        r[i] = (uint8_t)c;
+        c >>= 8;
+    }
+    c += ((uint32_t)a[i]) - ((uint32_t)b[i]);
+    r[i] = (uint8_t)(c & 127);
+    c = (c >> 7) * 19;
+#pragma unroll
+    for (i = 0; i < F25519_SIZE; i++) {
+        c += r[i];
+        r[i] = (uint8_t)c;
+        c >>= 8;
+    }
+}
+__device__ __forceinline__ void f25519_neg(uint8_t* __restrict__ r,
+    const uint8_t* __restrict__ a) {
+    uint32_t c = 218;
+    int i = 0;
+#pragma unroll
+    for (i = 0; i + 1 < F25519_SIZE; i++) {
+        c += 65280 - ((uint32_t)a[i]);
+        r[i] = (uint8_t)c;
+        c >>= 8;
+    }
+    c -= ((uint32_t)a[i]);
+    r[i] = (uint8_t)(c & 127);
+    c = (c >> 7) * 19;
+#pragma unroll
+    for (i = 0; i < F25519_SIZE; i++) {
+        c += r[i];
+        r[i] = (uint8_t)c;
+        c >>= 8;
+    }
+}
+__device__ __forceinline__ void f25519_mul__distinct(uint8_t* __restrict__ r,
+    const uint8_t* __restrict__ a,
+    const uint8_t* __restrict__ b) {
+    uint32_t c = 0;
+#pragma unroll
+    for (int i = 0; i < F25519_SIZE; i++) {
+        c >>= 8;
+        for (int j = 0; j <= i; j++) {
+            c += ((uint32_t)a[j]) * ((uint32_t)b[i - j]);
+        }
+        for (int j = i + 1; j < F25519_SIZE; j++) {
+            c += ((uint32_t)a[j]) * ((uint32_t)b[F25519_SIZE + i - j]) * 38;
+        }
+        r[i] = (uint8_t)c;
+    }
+    r[F25519_SIZE - 1] &= 127;
+    c = (c >> 7) * 19;
+#pragma unroll
+    for (int i = 0; i < F25519_SIZE; i++) {
+        c += r[i];
+        r[i] = (uint8_t)c;
+        c >>= 8;
+    }
+}
+__device__ __forceinline__ void f25519_mul_c(uint8_t* __restrict__ r,
+    const uint8_t* __restrict__ a, uint32_t b) {
+    uint32_t c = 0;
+#pragma unroll
+    for (int i = 0; i < F25519_SIZE; i++) {
+        c = (c >> 8) + b * ((uint32_t)a[i]);
+        r[i] = (uint8_t)c;
+    }
+    r[F25519_SIZE - 1] &= 127;
+    c = (c >> 7) * 19;
+#pragma unroll
+    for (int i = 0; i < F25519_SIZE; i++) {
+        c += r[i];
+        r[i] = (uint8_t)c;
+        c >>= 8;
+    }
+}
+__device__ __forceinline__ void f25519_inv__distinct(uint8_t* __restrict__ r,
+    const uint8_t* __restrict__ x) {
+    uint8_t s[F25519_SIZE];
+    f25519_mul__distinct(s, x, x);
+    f25519_mul__distinct(r, s, x);
+#pragma unroll
+    for (int i = 0; i < 248; i++) {
+        f25519_mul__distinct(s, r, r);
+        f25519_mul__distinct(r, s, x);
+    }
+    f25519_mul__distinct(s, r, r);
+    f25519_mul__distinct(r, s, s);
+    f25519_mul__distinct(s, r, x);
+    f25519_mul__distinct(r, s, s);
+    f25519_mul__distinct(s, r, r);
+    f25519_mul__distinct(r, s, x);
+    f25519_mul__distinct(s, r, r);
+    f25519_mul__distinct(r, s, x);
+}
+__device__ __forceinline__ void exp2523(uint8_t* __restrict__ r,
+    const uint8_t* __restrict__ x,
+    uint8_t* __restrict__ s) {
+    int i;
+    f25519_mul__distinct(r, x, x);
+    f25519_mul__distinct(s, r, x);
+#pragma unroll
+    for (i = 0; i < 248; i++) {
+        f25519_mul__distinct(r, s, s);
+        f25519_mul__distinct(s, r, x);
+    }
+    f25519_mul__distinct(r, s, s);
+    f25519_mul__distinct(s, r, r);
+    f25519_mul__distinct(r, s, x);
+}
+__device__ __forceinline__ void f25519_sqrt(uint8_t* __restrict__ r,
+    const uint8_t* __restrict__ a) {
+    uint8_t v[F25519_SIZE], i_val[F25519_SIZE], x[F25519_SIZE], y[F25519_SIZE];
+    f25519_mul_c(x, a, 2);
+    exp2523(v, x, y);
+    f25519_mul__distinct(y, v, v);
+    f25519_mul__distinct(i_val, x, y);
+    uint8_t one[F25519_SIZE];
+    f25519_load(one, 1);
+    f25519_sub(i_val, i_val, one);
+    f25519_mul__distinct(x, v, a);
+    f25519_mul__distinct(r, x, i_val);
+}
+#endif
--- a/libs/fprime.cuh
+++ b/libs/fprime.cuh
@ -0,0 +1,157 @@
+#ifndef __FPRIME_CUH
+#define __FPRIME_CUH
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#ifndef COMPACT_DISABLE_ED25519
+#ifdef FULL_C25519_CODE
+__device__ const uint8_t fprime_zero[FPRIME_SIZE] = { 0 };
+__device__ const uint8_t fprime_one[FPRIME_SIZE] = { 1 };
+#endif
+#define FPRIME_SIZE  32
+__device__ void raw_add(uint8_t* x, const uint8_t* p) {
+    uint16_t c = 0;
+    for (int i = 0; i < FPRIME_SIZE; i++) {
+        c += ((uint16_t)x[i]) + ((uint16_t)p[i]);
+        x[i] = (uint8_t)c;
+        c >>= 8;
+    }
+}
+__device__ void fprime_select(uint8_t* dst, const uint8_t* zero, const uint8_t* one, uint8_t condition) {
+    const uint8_t mask = -condition;
+    for (int i = 0; i < FPRIME_SIZE; i++)
+        dst[i] = zero[i] ^ (mask & (one[i] ^ zero[i]));
+}
+__device__ void raw_try_sub(uint8_t* x, const uint8_t* p)
+{
+    uint8_t minusp[FPRIME_SIZE];
+    uint16_t c = 0;
+    for (int i = 0; i < FPRIME_SIZE; i++) {
+        c = ((uint16_t)x[i]) - ((uint16_t)p[i]) - c;
+        minusp[i] = (uint8_t)c;
+        c = (c >> 8) & 1;
+    }
+    fprime_select(x, minusp, x, c);
+}
+__device__ int prime_msb(const uint8_t* p) {
+    int i;
+    uint8_t x;
+    for (i = FPRIME_SIZE - 1; i >= 0; i--) {
+        if (p[i])
+            break;
+    }
+    x = p[i];
+    i <<= 3;
+    while (x) {
+        x >>= 1;
+        i++;
+    }
+    return i - 1;
+}
+__device__ void shift_n_bits(uint8_t* x, int n) {
+    uint16_t c = 0;
+    for (int i = 0; i < FPRIME_SIZE; i++) {
+        c |= ((uint16_t)x[i]) << n;
+        x[i] = (uint8_t)c;
+        c >>= 8;
+    }
+}
+#ifdef FULL_C25519_CODE
+__device__ void fprime_load(uint8_t* x, uint32_t c)
+{
+    unsigned int i;
+    for (i = 0; i < sizeof(c); i++) {
+        x[i] = (uint8_t)c;
+        c >>= 8;
+    }
+    for (; i < FPRIME_SIZE; i++)
+        x[i] = 0;
+}
+#endif
+__device__ inline int min_int(int a, int b) {
+    return a < b ? a : b;
+}
+__device__ void fprime_from_bytes(uint8_t* n, const uint8_t* x, size_t len, const uint8_t* modulus) {
+    const int preload_total = min_int(prime_msb(modulus) - 1, (int)(len << 3));
+    const int preload_bytes = preload_total >> 3;
+    const int preload_bits = preload_total & 7;
+    const int rbits = (len << 3) - preload_total;
+    memset(n, 0, FPRIME_SIZE);
+    for (int i = 0; i < preload_bytes; i++)
+        n[i] = x[len - preload_bytes + i];
+    if (preload_bits) {
+        shift_n_bits(n, preload_bits);
+        n[0] |= x[len - preload_bytes - 1] >> (8 - preload_bits);
+    }
+    for (int i = rbits - 1; i >= 0; i--) {
+        const uint8_t bit = (x[i >> 3] >> (i & 7)) & 1;
+        shift_n_bits(n, 1);
+        n[0] |= bit;
+        raw_try_sub(n, modulus);
+    }
+}
+#ifdef FULL_C25519_CODE
+__device__ void fprime_normalize(uint8_t* x, const uint8_t* modulus) {
+    uint8_t n[FPRIME_SIZE];
+    fprime_from_bytes(n, x, FPRIME_SIZE, modulus);
+    fprime_copy(x, n);
+}
+__device__ uint8_t fprime_eq(const uint8_t* x, const uint8_t* y) {
+    uint8_t sum = 0;
+    for (int i = 0; i < FPRIME_SIZE; i++)
+        sum |= x[i] ^ y[i];
+    sum |= (sum >> 4);
+    sum |= (sum >> 2);
+    sum |= (sum >> 1);
+    return (sum ^ 1) & 1;
+}
+#endif
+__device__ void fprime_add(uint8_t* r, const uint8_t* a, const uint8_t* modulus) {
+    raw_add(r, a);
+    raw_try_sub(r, modulus);
+}
+#ifdef FULL_C25519_CODE
+__device__ void fprime_sub(uint8_t* r, const uint8_t* a, const uint8_t* modulus) {
+    raw_add(r, modulus);
+    raw_try_sub(r, a);
+    raw_try_sub(r, modulus);
+}
+#endif
+__device__ inline void fprime_copy(uint8_t* x, const uint8_t* a) {
+    memcpy(x, a, FPRIME_SIZE);
+}
+__device__ void fprime_mul(uint8_t* r, const uint8_t* a, const uint8_t* b, const uint8_t* modulus) {
+    memset(r, 0, FPRIME_SIZE);
+    for (int i = prime_msb(modulus); i >= 0; i--) {
+        const uint8_t bit = (b[i >> 3] >> (i & 7)) & 1;
+        uint8_t plusa[FPRIME_SIZE];
+        shift_n_bits(r, 1);
+        raw_try_sub(r, modulus);
+        fprime_copy(plusa, r);
+        fprime_add(plusa, a, modulus);
+        fprime_select(r, r, plusa, bit);
+    }
+}
+#ifdef FULL_C25519_CODE
+__device__ void fprime_inv(uint8_t* r, const uint8_t* a, const uint8_t* modulus) {
+    uint8_t pm2[FPRIME_SIZE];
+    uint16_t c = 2;
+    fprime_copy(pm2, modulus);
+    for (int i = 0; i < FPRIME_SIZE; i++) {
+        c = modulus[i] - c;
+        pm2[i] = (uint8_t)c;
+        c >>= 8;
+    }
+    fprime_load(r, 1);
+    for (int i = prime_msb(modulus); i >= 0; i--) {
+        uint8_t r2[FPRIME_SIZE];
+        fprime_mul(r2, r, r, modulus);
+        if ((pm2[i >> 3] >> (i & 7)) & 1)
+            fprime_mul(r, r2, a, modulus);
+        else
+            fprime_copy(r, r2);
+    }
+}
+#endif
+#endif
+#endif
--- a/libs/sha512.cuh
+++ b/libs/sha512.cuh
@ -0,0 +1,175 @@
+#ifndef __SHA512_CUH
+#define __SHA512_CUH
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#define SHA512_BLOCK_SIZE 128
+struct sha512_state {
+    uint64_t h[8];
+};
+#if !defined(COMPACT_DISABLE_ED25519) || !defined(COMPACT_DISABLE_X25519_DERIVE)
+__device__ __constant__ sha512_state sha512_initial_state = { {
+    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+    0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+    0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
+} };
+#endif
+__device__ __constant__ uint64_t round_k[80] = {
+    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
+    0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+    0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+    0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
+    0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
+    0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+    0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+    0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
+    0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
+    0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+    0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+    0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
+    0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
+    0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+    0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+    0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
+    0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
+    0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+    0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+    0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
+    0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
+    0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+    0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+    0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
+    0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
+    0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+    0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
+};
+__device__ __forceinline__ uint64_t load64(const uint8_t* x) {
+    return ((uint64_t)x[0] << 56) | ((uint64_t)x[1] << 48) |
+        ((uint64_t)x[2] << 40) | ((uint64_t)x[3] << 32) |
+        ((uint64_t)x[4] << 24) | ((uint64_t)x[5] << 16) |
+        ((uint64_t)x[6] << 8) | ((uint64_t)x[7]);
+}
+__device__ __forceinline__ void store64(uint8_t* x, uint64_t v) {
+    x[0] = (uint8_t)(v >> 56);
+    x[1] = (uint8_t)(v >> 48);
+    x[2] = (uint8_t)(v >> 40);
+    x[3] = (uint8_t)(v >> 32);
+    x[4] = (uint8_t)(v >> 24);
+    x[5] = (uint8_t)(v >> 16);
+    x[6] = (uint8_t)(v >> 8);
+    x[7] = (uint8_t)(v);
+}
+__device__ __forceinline__ uint64_t rot64(uint64_t x, int bits) {
+    return (x >> bits) | (x << (64 - bits));
+}
+__device__ void sha512_block(sha512_state* s, const uint8_t* blk) {
+    uint64_t w[16];
+#pragma unroll
+    for (int i = 0; i < 16; i++) {
+        w[i] = load64(blk + i * 8);
+    }
+    uint64_t a = s->h[0];
+    uint64_t b = s->h[1];
+    uint64_t c = s->h[2];
+    uint64_t d = s->h[3];
+    uint64_t e = s->h[4];
+    uint64_t f = s->h[5];
+    uint64_t g = s->h[6];
+    uint64_t h = s->h[7];
+#pragma unroll
+    for (int i = 0; i < 80; i++) {
+        const int idx = i & 15;
+        const int idx1 = (i + 1) & 15;
+        const int idx7 = (i + 9) & 15;
+        const int idx14 = (i + 14) & 15;
+        uint64_t s0 = rot64(w[idx1], 1) ^ rot64(w[idx1], 8) ^ (w[idx1] >> 7);
+        uint64_t s1 = rot64(w[idx14], 19) ^ rot64(w[idx14], 61) ^ (w[idx14] >> 6);
+        uint64_t S0 = rot64(a, 28) ^ rot64(a, 34) ^ rot64(a, 39);
+        uint64_t S1 = rot64(e, 14) ^ rot64(e, 18) ^ rot64(e, 41);
+        uint64_t ch = (e & f) ^ ((~e) & g);
+        uint64_t temp1 = h + S1 + ch + round_k[i] + w[idx];
+        uint64_t maj = (a & b) ^ (a & c) ^ (b & c);
+        uint64_t temp2 = S0 + maj;
+        h = g;
+        g = f;
+        f = e;
+        e = d + temp1;
+        d = c;
+        c = b;
+        b = a;
+        a = temp1 + temp2;
+
+        w[idx] += s0 + w[idx7] + s1;
+    }
+    s->h[0] += a;
+    s->h[1] += b;
+    s->h[2] += c;
+    s->h[3] += d;
+    s->h[4] += e;
+    s->h[5] += f;
+    s->h[6] += g;
+    s->h[7] += h;
+}
+__device__ void sha512_final(sha512_state* s, const uint8_t* blk, size_t total_size) {
+    uint8_t temp[SHA512_BLOCK_SIZE];
+    memset(temp, 0, sizeof(temp));
+
+    size_t last_size = total_size & (SHA512_BLOCK_SIZE - 1);
+    if (last_size) {
+        memcpy(temp, blk, last_size);
+    }
+    temp[last_size] = 0x80;
+    if (last_size > (SHA512_BLOCK_SIZE - 9)) {
+        sha512_block(s, temp);
+        memset(temp, 0, sizeof(temp));
+    }
+    store64(temp + SHA512_BLOCK_SIZE - 8, total_size << 3);
+    sha512_block(s, temp);
+}
+__device__ void sha512_get(const sha512_state* s, uint8_t* hash, unsigned int offset, unsigned int len) {
+    if (offset > SHA512_BLOCK_SIZE)
+        return;
+    if (len > SHA512_BLOCK_SIZE - offset)
+        len = SHA512_BLOCK_SIZE - offset;
+    unsigned int i = offset >> 3;
+    unsigned int off = offset & 7;
+    if (off) {
+        uint8_t tmp[8];
+        store64(tmp, s->h[i]);
+        unsigned int c = 8 - off;
+        if (c > len) c = len;
+        memcpy(hash, tmp + off, c);
+        len -= c;
+        hash += c;
+        i++;
+    }
+    while (len >= 8) {
+        store64(hash, s->h[i]);
+        hash += 8;
+        len -= 8;
+        i++;
+    }
+    if (len) {
+        uint8_t tmp[8];
+        store64(tmp, s->h[i]);
+        memcpy(hash, tmp, len);
+    }
+}
+__device__ void sha512_init(struct sha512_state* s) {
+    memcpy(s, &sha512_initial_state, sizeof(*s));
+}
+#endif
--- a/sources/main.cpp
+++ b/sources/main.cpp
@ -13,7 +13,6 @@
 struct option {
    unsigned proc = 0;
    std::atomic<unsigned> high = 0x14;
-    //std::string outputfile;
 };
 static option conf;
 int parameters(std::string arg) {
@ -38,8 +37,9 @@ int parameters(std::string arg) {
        }
        */
    }
-    if (arg == "--threads" || arg == "-t") return 777;
-    else if (arg == "--altitude" || arg == "-a") return 777;
+    if (arg == "--threads" || arg == "-t" || arg == "--altitude" || arg == "-a") {
+        return 777;
+    }
    return 0;
 }
 void displayConfig() {
@ -110,10 +110,7 @@ inline void invertKey(const unsigned char* __restrict key, Key& inverted) noexce
 }
 inline void rmbytes(unsigned char* __restrict buf, unsigned char size, unsigned long& state) noexcept {
    for (unsigned char x = 0; x < size / 32; x++) {
-        _mm256_storeu_si256((__m256i*) & buf[x * 32], _mm256_set_epi64x(xorshift64(state), xorshift64(state), xorshift64(state), xorshift64(state)));
-    }
-    for (unsigned char x = 0; x < (size % 32); x++) {
-        buf[(size / 32) * 32 + x] = static_cast<unsigned char>(xorshift64(state) & 0xFF);
+        _mm256_store_si256((__m256i*) & buf[x * 32], _mm256_set_epi64x(xorshift64(state), xorshift64(state), xorshift64(state), xorshift64(state)));
    }
 }
 inline void sign_keypair(unsigned char* __restrict pk, unsigned char* __restrict sk, const unsigned char* __restrict seed) noexcept {
@ -121,8 +118,8 @@ inline void sign_keypair(unsigned char* __restrict pk, unsigned char* __restrict
    crypto_hash_sha512(h, seed, 32);
    h[31] = (h[31] & 0xF8) | (0x40 | (h[31] & 0x7F));
    crypto_scalarmult_ed25519_base(pk, h);
-    _mm256_storeu_si256(reinterpret_cast<__m256i*>(sk), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(seed)));
-    _mm256_storeu_si256(reinterpret_cast<__m256i*>(sk + 32), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(pk)));
+    _mm256_store_si256(reinterpret_cast<__m256i*>(sk), _mm256_load_si256(reinterpret_cast<const __m256i*>(seed)));
+    _mm256_store_si256(reinterpret_cast<__m256i*>(sk + 32), _mm256_load_si256(reinterpret_cast<const __m256i*>(pk)));
 }
 void miner_thread() noexcept {
    alignas(32) Key inv;
@ -135,10 +132,9 @@ void miner_thread() noexcept {
    printf("Using seed: %lu\n", state);
    while (true) {
        rmbytes(seed, sizeof(seed), state);
-        sign_keypair(keys.PublicKey, keys.PrivateKey, seed);
-        //crypto_sign_ed25519_seed_keypair(keys.PublicKey, keys.PrivateKey, seed);
-        ones = getZeros(keys.PublicKey);
-        if (ones > conf.high.load()) {
+        //sign_keypair(keys.PublicKey, keys.PrivateKey, seed);
+        crypto_sign_ed25519_seed_keypair(keys.PublicKey, keys.PrivateKey, seed);
+        if (ones = getZeros(keys.PublicKey); ones > conf.high.load()) {
            conf.high.store(ones);
            invertKey(keys.PublicKey, inv);
            getRawAddress(ones, inv, rawAddr);
@ -157,22 +153,16 @@ void startThreads() noexcept {
    }
 }
 int main(int argc, char* argv[]) noexcept {
-    if (argc >= 2) {
-        int res = -1;
-        for (int i = 1;; ++i) {
-            if (argv[i] == nullptr) break;
-            res = parameters(std::string(argv[i]));
-            if (res == 777) {
-                i++;
-                if (argv[i] == nullptr) {
-                    std::cerr << " Empty value for parameter \"" << argv[i - 1] << "\"" << std::endl;
-                    return 776;
-                }
-                int res2 = parameters(std::string(std::string(argv[i - 1]) + " " + std::string(argv[i])));
-                if (res2 != 0) {
-                    std::cerr << " Wrong value \"" << argv[i] << "\" for parameter \"" << argv[i - 1] << "\"" << std::endl;
-                    return res;
-                }
+    if (argc < 2) return 0;
+    for (int x = 1; x < argc; x++) {
+        if (int res = parameters(argv[x]); res == 777) {
+            if (++x >= argc) {
+                std::cerr << "Empty value for parameter \"" << argv[x - 1] << "\"" << std::endl;
+                return 776;
+            }
+            if (parameters(argv[x - 1] + std::string(" ") + argv[x]) != 0) {
+                std::cerr << "Wrong value \"" << argv[x] << "\" for parameter \"" << argv[x - 1] << "\"" << std::endl;
+                return res;
            }
        }
    }
--- a/sources/main.cu
+++ b/sources/main.cu
@ -0,0 +1,199 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cstdint>
+#include <cuda_runtime.h>
+#include <curand_kernel.h>
+#include <arpa/inet.h>
+#include "../libs/sha512.cuh"
+#include "../libs/ed25519.cuh"
+#include "../libs/edsign.cuh"
+#define MAX_RESULTS 1024
+__constant__ char hexDigitsConst[17] = "0123456789abcdef";
+using Address = unsigned char[16];
+using Key = unsigned char[32];
+
+struct KeysBox {
+    Key PublicKey;
+    Key PrivateKey;
+};
+struct option {
+    unsigned proc = 0;
+    __device__ __managed__ unsigned high = 0x10;
+};
+__device__ static option conf;
+struct ds64 {
+    char data[65];
+};
+struct ds46 {
+    char data[46];
+};
+__host__ __device__ ds64 KeyToString(const unsigned char* key) noexcept {
+    ds64 str;
+#ifdef __CUDA_ARCH__
+    const char* hexDigits = hexDigitsConst;
+#else
+    const char* hexDigits = "0123456789abcdef";
+#endif
+    for (unsigned char i = 0; i < 32; i++) {
+        str.data[2 * i] = hexDigits[key[i] >> 4];
+        str.data[2 * i + 1] = hexDigits[key[i] & 0x0F];
+    }
+    str.data[64] = '\0';
+    return str;
+}
+__device__ ds46 getAddress(const unsigned char rawAddr[16]) noexcept {
+    ds46 addrStr;
+#ifdef __CUDA_ARCH__
+    const char* hexDigits = hexDigitsConst;
+#else
+    const char* hexDigits = "0123456789abcdef";
+#endif
+    int pos = 0;
+    for (int group = 0; group < 8; group++) {
+        int idx = group * 2;
+        addrStr.data[pos++] = hexDigits[rawAddr[idx] >> 4];
+        addrStr.data[pos++] = hexDigits[rawAddr[idx] & 0x0F];
+        addrStr.data[pos++] = hexDigits[rawAddr[idx + 1] >> 4];
+        addrStr.data[pos++] = hexDigits[rawAddr[idx + 1] & 0x0F];
+        if (group < 7) {
+            addrStr.data[pos++] = ':';
+        }
+    }
+    addrStr.data[pos] = '\0';
+    return addrStr;
+}
+__device__ void getRawAddress(int lErase, Key& InvertedPublicKey, Address& rawAddr) noexcept {
+    lErase++;
+    const int bitsToShift = lErase % 8;
+    const int start = lErase / 8;
+    if (bitsToShift != 0) {
+        for (int i = start; i < start + 15; i++) {
+            InvertedPublicKey[i] = static_cast<unsigned char>(
+                (InvertedPublicKey[i] << bitsToShift) |
+                (InvertedPublicKey[i + 1] >> (8 - bitsToShift))
+                );
+        }
+    }
+    rawAddr[0] = 0x02;
+    rawAddr[1] = static_cast<unsigned char>(lErase - 1);
+    memcpy(&rawAddr[2], &InvertedPublicKey[start], 14);
+}
+__device__ unsigned long long xorshift128plus(unsigned long long* state) {
+    unsigned long long x = state[0];
+    const unsigned long long y = state[1];
+    state[0] = y;
+    x ^= x << 23;
+    x ^= x >> 17;
+    x ^= y ^ (y >> 26);
+    state[1] = x;
+    return x + y;
+}
+__device__ unsigned char zeroCounter(unsigned int x) {
+    if (x == 0)
+        return 32;
+#ifdef __CUDA_ARCH__
+    return static_cast<unsigned char>(__clz(x));
+#else
+    return static_cast<unsigned char>(__builtin_clz(x));
+#endif
+}
+__device__ unsigned char getZeros(const unsigned char* v) {
+    unsigned char leadZeros = 0;
+    for (int i = 0; i < 32; i += 4) {
+        unsigned int word = (static_cast<unsigned int>(v[i]) << 24) |
+            (static_cast<unsigned int>(v[i + 1]) << 16) |
+            (static_cast<unsigned int>(v[i + 2]) << 8) |
+            (static_cast<unsigned int>(v[i + 3]));
+        if (word == 0)
+            leadZeros += 32;
+        else {
+            leadZeros += zeroCounter(word);
+            break;
+        }
+    }
+    return leadZeros;
+}
+__global__ void initRandStates(curandState* randStates) {
+    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    curand_init((unsigned long long)clock64() + id, id, 0, &randStates[id]);
+}
+__device__ void generateRandomBytes(uint8_t* buf, size_t size, curandState* state) {
+    for (size_t i = 0; i < size; i++) {
+        buf[i] = curand(state) & 0xFF;
+    }
+}
+__device__ void invertKey(const unsigned char* key, unsigned char* inverted) {
+    for (int i = 0; i < 32; i++)
+        inverted[i] = key[i] ^ 0xFF;
+}
+__device__ void compact_wipe(void* data, size_t length) {
+    volatile uint8_t* p = (volatile uint8_t*)data;
+    while (length--) {
+        *p++ = 0;
+    }
+}
+__device__ void ed25519_keygen(uint8_t private_key[64], uint8_t public_key[32], uint8_t random_seed[32]) {
+    edsign_sec_to_pub(public_key, random_seed);
+    memcpy(private_key, random_seed, 32);
+    memcpy(private_key + 32, public_key, 32);
+    compact_wipe(random_seed, 32);
+}
+struct Result {
+    char ipv6[46];
+    char pk[65];
+    char sk[65];
+};
+__device__ __managed__ Result resultBuffer[MAX_RESULTS];
+__device__ __managed__ int resultCount = 0;
+__global__ __launch_bounds__(256) void minerKernel(curandState* randStates) {
+    int tid_global = blockIdx.x * blockDim.x + threadIdx.x;
+    curandState localState = randStates[tid_global];
+    uint8_t seed[32];
+    generateRandomBytes(seed, sizeof(seed), &localState);
+    if (tid_global == 0) {
+        printf("Seed: %s\n", KeyToString(seed).data);
+    }
+    while (true) {
+        generateRandomBytes(seed, sizeof(seed), &localState);
+        KeysBox keys;
+        ed25519_keygen(keys.PrivateKey, keys.PublicKey, seed);
+        int zeros = getZeros(keys.PublicKey);
+        unsigned oldHigh = atomicMax(&conf.high, (unsigned)zeros);
+        if (zeros > oldHigh) {
+            Key inv;
+            Address rawAddr_local;
+            invertKey(keys.PublicKey, inv);
+            getRawAddress(zeros, inv, rawAddr_local);
+            ds46 addrStr = getAddress(rawAddr_local);
+            ds64 pkStr = KeyToString(keys.PublicKey);
+            ds64 skStr = KeyToString(keys.PrivateKey);
+            int idx = atomicAdd(&resultCount, 1);
+            if (idx < MAX_RESULTS) {
+                memcpy(resultBuffer[idx].ipv6, addrStr.data, sizeof(addrStr.data));
+                memcpy(resultBuffer[idx].pk, pkStr.data, sizeof(pkStr.data));
+                memcpy(resultBuffer[idx].sk, skStr.data, sizeof(skStr.data));
+            }
+        }
+        if (tid_global == 0) {
+            if (resultCount > 0) {
+                for (int i = 0; i < resultCount; i++) {
+                    printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", resultBuffer[i].ipv6, resultBuffer[i].pk, resultBuffer[i].sk);
+                }
+                resultCount = 0;
+            }
+        }
+        __syncthreads();
+    }
+    randStates[tid_global] = localState;
+}
+int main() {
+    curandState* d_randStates;
+    cudaMalloc(&d_randStates, 1024 * sizeof(curandState));
+    initRandStates << <4, 256 >> > (d_randStates);
+    cudaDeviceSynchronize();
+    minerKernel << <4, 256 >> > (d_randStates);
+    cudaDeviceSynchronize();
+    cudaFree(d_randStates);
+    return 0;
+}