This commit is contained in:
rcxpony 2025-03-14 22:52:59 +05:00
parent 701139d0bd
commit b778652ef8
14 changed files with 605 additions and 542 deletions

View File

@ -1,11 +1,17 @@
# yggm # yggm
### Yggdrasil address miner ### Yggdrasil address miner
# How to build # How to build for CPU
```sh ```sh
git clone https://rcxpony.name/rcxpony/yggm.git && cd yggm git clone https://rcxpony.name/rcxpony/yggm.git && cd yggm
cmake -B build && cmake --build build -j$(nproc) cmake -B build && cmake --build build -j$(nproc)
build/yggm -t 10 build/yggm -t 10
``` ```
# How to build for GPU
```sh
git clone https://rcxpony.name/rcxpony/yggm.git && cd yggm/build
make -j$(nproc)
./yggmcu -t 10
```
# ToDo # ToDo
- [x] Cuda support (not optimized) - [x] Cuda support (not optimized)
- [x] Support for avx2 - [x] Support for avx2

View File

@ -1,17 +1,24 @@
NVCC := nvcc NVCC := nvcc
NVCC_FLAGS := -O3 -use_fast_math -Xptxas -O3 -gencode arch=compute_75,code=sm_75 --default-stream per-thread -Wno-deprecated-gpu-targets --expt-relaxed-constexpr -I../libs/ NVCC_FLAGS := -rdc=true -O3 -use_fast_math -Xptxas -O3 \
MAIN_SOURCE := ../sources/main.cu -gencode arch=compute_75,code=sm_75 \
LIBS_DIR := ../libs/ --default-stream per-thread \
BUILD_DIR := ../build -Wno-deprecated-gpu-targets \
--expt-relaxed-constexpr \
-I../libs/ \
-std=c++20
MAIN_SOURCE := ../sources/main.cu
LIBS_DIR := ../libs/
BUILD_DIR := ../build
LIBS_SOURCES := $(wildcard $(LIBS_DIR)*.cu) LIBS_SOURCES := $(wildcard $(LIBS_DIR)*.cu)
LIBS_OBJECTS := $(patsubst $(LIBS_DIR)%.cu,$(BUILD_DIR)/%.o,$(LIBS_SOURCES)) LIBS_OBJECTS := $(patsubst $(LIBS_DIR)%.cu,$(BUILD_DIR)/%.o,$(LIBS_SOURCES))
TARGET := main TARGET := yggmcu
.PHONY: all clean
all: $(TARGET) all: $(TARGET)
$(BUILD_DIR): $(BUILD_DIR):
mkdir -p $(BUILD_DIR) @mkdir -p $(BUILD_DIR)
$(TARGET): $(MAIN_SOURCE) $(LIBS_OBJECTS) $(TARGET): $(MAIN_SOURCE) $(LIBS_OBJECTS)
$(NVCC) $(NVCC_FLAGS) -o $(TARGET) $(MAIN_SOURCE) $(LIBS_OBJECTS) $(NVCC) $(NVCC_FLAGS) -o $@ $^
$(BUILD_DIR)/%.o: $(LIBS_DIR)%.cu | $(BUILD_DIR) $(BUILD_DIR)/%.o: $(LIBS_DIR)%.cu | $(BUILD_DIR)
$(NVCC) $(NVCC_FLAGS) -c $< -o $@ $(NVCC) $(NVCC_FLAGS) -c $< -o $@
clean: clean:
rm $(BUILD_DIR)/main @rm -f $(BUILD_DIR)/*.o $(TARGET)

113
libs/ed25519.cu Normal file
View File

@ -0,0 +1,113 @@
#include <ed25519.cuh>
#include <f25519.cuh>
__device__ __constant__ struct ed25519_pt ed25519_base = {
{0x1a,0xd5,0x25,0x8f,0x60,0x2d,0x56,0xc9,0xb2,0xa7,0x25,0x95,0x60,0xc7,0x2c,0x69,
0x5c,0xdc,0xd6,0xfd,0x31,0xe2,0xa4,0xc0,0xfe,0x53,0x6e,0xcd,0xd3,0x36,0x69,0x21},
{0x58,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,
0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66},
{0xa3,0xdd,0xb7,0xa5,0xb3,0x8a,0xde,0x6d,0xf5,0x52,0x51,0x77,0x80,0x9f,0xf0,0x20,
0x7d,0xe3,0xab,0x64,0x8e,0x4e,0xea,0x66,0x65,0x76,0x8b,0xd7,0x0f,0x5f,0x87,0x67},
{1,0}
};
__device__ __constant__ struct ed25519_pt ed25519_neutral = {
{0}, {1,0}, {0}, {1,0}
};
__device__ __constant__ unsigned char ed25519_d[F25519_SIZE] = {
0xa3,0x78,0x59,0x13,0xca,0x4d,0xeb,0x75,0xab,0xd8,0x41,0x41,0x4d,0x0a,0x70,0x00,
0x98,0xe8,0x79,0x77,0x79,0x40,0xc7,0x8c,0x73,0xfe,0x6f,0x2b,0xee,0x6c,0x03,0x52
};
__device__ __constant__ unsigned char ed25519_k[F25519_SIZE] = {
0x59,0xf1,0xb2,0x26,0x94,0x9b,0xd6,0xeb,0x56,0xb1,0x83,0x82,0x9a,0x14,0xe0,0x00,
0x30,0xd1,0xf3,0xee,0xf2,0x80,0x8e,0x19,0xe7,0xfc,0xdf,0x56,0xdc,0xd9,0x06,0x24
};
__device__ __forceinline__ void ed25519_project(struct ed25519_pt* p, const unsigned char* x, const unsigned char* y) {
f25519_copy(p->x, x);
f25519_copy(p->y, y);
f25519_load(p->z, 1);
f25519_mul__distinct(p->t, x, y);
}
__device__ void ed25519_unproject(unsigned char* x, unsigned char* y, const struct ed25519_pt* p) {
unsigned char z1[F25519_SIZE];
f25519_inv__distinct(z1, p->z);
f25519_mul__distinct(x, p->x, z1);
f25519_mul__distinct(y, p->y, z1);
f25519_normalize(x);
f25519_normalize(y);
}
__device__ void ed25519_pack(unsigned char* c, const unsigned char* x, const unsigned char* y) {
unsigned char tmp[F25519_SIZE];
unsigned char parity;
f25519_copy(tmp, x);
f25519_normalize(tmp);
parity = (tmp[0] & 1) << 7;
f25519_copy(c, y);
f25519_normalize(c);
c[31] |= parity;
}
__device__ __forceinline__ void ed25519_add(struct ed25519_pt* r, const struct ed25519_pt* p1, const struct ed25519_pt* p2) {
unsigned char a[F25519_SIZE], b[F25519_SIZE], c[F25519_SIZE], d[F25519_SIZE];
unsigned char e[F25519_SIZE], f[F25519_SIZE], g[F25519_SIZE], h[F25519_SIZE];
f25519_sub(c, p1->y, p1->x);
f25519_sub(d, p2->y, p2->x);
f25519_mul__distinct(a, c, d);
f25519_add(c, p1->y, p1->x);
f25519_add(d, p2->y, p2->x);
f25519_mul__distinct(b, c, d);
f25519_mul__distinct(d, p1->t, p2->t);
f25519_mul__distinct(c, d, ed25519_k);
f25519_mul__distinct(d, p1->z, p2->z);
f25519_add(d, d, d);
f25519_sub(e, b, a);
f25519_sub(f, d, c);
f25519_add(g, d, c);
f25519_add(h, b, a);
f25519_mul__distinct(r->x, e, f);
f25519_mul__distinct(r->y, g, h);
f25519_mul__distinct(r->t, e, h);
f25519_mul__distinct(r->z, f, g);
}
__device__ __forceinline__ void ed25519_double(struct ed25519_pt* r, const struct ed25519_pt* p) {
unsigned char a[F25519_SIZE], b[F25519_SIZE], c[F25519_SIZE];
unsigned char e[F25519_SIZE], f[F25519_SIZE], g[F25519_SIZE], h[F25519_SIZE];
f25519_mul__distinct(a, p->x, p->x);
f25519_mul__distinct(b, p->y, p->y);
f25519_mul__distinct(c, p->z, p->z);
f25519_add(c, c, c);
f25519_add(f, p->x, p->y);
f25519_mul__distinct(e, f, f);
f25519_sub(e, e, a);
f25519_sub(e, e, b);
f25519_sub(g, b, a);
f25519_sub(f, g, c);
f25519_neg(h, b);
f25519_sub(h, h, a);
f25519_mul__distinct(r->x, e, f);
f25519_mul__distinct(r->y, g, h);
f25519_mul__distinct(r->t, e, h);
f25519_mul__distinct(r->z, f, g);
}
__device__ __forceinline__ void ed25519_copy(struct ed25519_pt* dst, const struct ed25519_pt* src) {
f25519_copy(dst->x, src->x);
f25519_copy(dst->y, src->y);
f25519_copy(dst->t, src->t);
f25519_copy(dst->z, src->z);
}
__device__ void ed25519_smult(struct ed25519_pt* r_out, const struct ed25519_pt* p, const unsigned char* e) {
struct ed25519_pt r = ed25519_neutral;
for (int i = 255; i >= 0; i--) {
struct ed25519_pt s;
ed25519_double(&r, &r);
ed25519_add(&s, &r, p);
unsigned char bit = (e[i >> 3] >> (i & 7)) & 1;
f25519_select(r.x, r.x, s.x, bit);
f25519_select(r.y, r.y, s.y, bit);
f25519_select(r.z, r.z, s.z, bit);
f25519_select(r.t, r.t, s.t, bit);
}
ed25519_copy(r_out, &r);
}
__device__ void ed25519_prepare(unsigned char* e) {
e[0] &= 0xf8;
e[31] &= 0x7f;
e[31] |= 0x40;
}

View File

@ -1,117 +1,17 @@
#pragma once #ifndef __ED25519_CUH
#include <f25519.cuh> #define __ED25519_CUH
#define F25519_SIZE 32 #define F25519_SIZE 32
struct ed25519_pt { struct ed25519_pt { unsigned char x[F25519_SIZE], y[F25519_SIZE], t[F25519_SIZE], z[F25519_SIZE]; };
unsigned char x[F25519_SIZE], y[F25519_SIZE], t[F25519_SIZE], z[F25519_SIZE]; extern __device__ __constant__ struct ed25519_pt ed25519_base;
}; extern __device__ __constant__ struct ed25519_pt ed25519_neutral;
__device__ __constant__ struct ed25519_pt ed25519_base = { extern __device__ __constant__ unsigned char ed25519_d[F25519_SIZE];
{0x1a,0xd5,0x25,0x8f,0x60,0x2d,0x56,0xc9,0xb2,0xa7,0x25,0x95,0x60,0xc7,0x2c,0x69, extern __device__ __constant__ unsigned char ed25519_k[F25519_SIZE];
0x5c,0xdc,0xd6,0xfd,0x31,0xe2,0xa4,0xc0,0xfe,0x53,0x6e,0xcd,0xd3,0x36,0x69,0x21}, __device__ __forceinline__ void ed25519_project(struct ed25519_pt* p, const unsigned char* x, const unsigned char* y);
{0x58,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66, __device__ void ed25519_unproject(unsigned char* x, unsigned char* y, const struct ed25519_pt* p);
0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66}, __device__ void ed25519_pack(unsigned char* c, const unsigned char* x, const unsigned char* y);
{0xa3,0xdd,0xb7,0xa5,0xb3,0x8a,0xde,0x6d,0xf5,0x52,0x51,0x77,0x80,0x9f,0xf0,0x20, __device__ __forceinline__ void ed25519_add(struct ed25519_pt* r, const struct ed25519_pt* p1, const struct ed25519_pt* p2);
0x7d,0xe3,0xab,0x64,0x8e,0x4e,0xea,0x66,0x65,0x76,0x8b,0xd7,0x0f,0x5f,0x87,0x67}, __device__ __forceinline__ void ed25519_double(struct ed25519_pt* r, const struct ed25519_pt* p);
{1,0} __device__ __forceinline__ void ed25519_copy(struct ed25519_pt* dst, const struct ed25519_pt* src);
}; __device__ void ed25519_smult(struct ed25519_pt* r_out, const struct ed25519_pt* p, const unsigned char* e);
__device__ __constant__ struct ed25519_pt ed25519_neutral = { __device__ void ed25519_prepare(unsigned char* e);
{0}, {1,0}, {0}, {1,0} #endif
};
__device__ __constant__ unsigned char ed25519_d[F25519_SIZE] = {
0xa3,0x78,0x59,0x13,0xca,0x4d,0xeb,0x75,0xab,0xd8,0x41,0x41,0x4d,0x0a,0x70,0x00,
0x98,0xe8,0x79,0x77,0x79,0x40,0xc7,0x8c,0x73,0xfe,0x6f,0x2b,0xee,0x6c,0x03,0x52
};
__device__ __constant__ unsigned char ed25519_k[F25519_SIZE] = {
0x59,0xf1,0xb2,0x26,0x94,0x9b,0xd6,0xeb,0x56,0xb1,0x83,0x82,0x9a,0x14,0xe0,0x00,
0x30,0xd1,0xf3,0xee,0xf2,0x80,0x8e,0x19,0xe7,0xfc,0xdf,0x56,0xdc,0xd9,0x06,0x24
};
__device__ __forceinline__ void ed25519_project(struct ed25519_pt* p, const unsigned char* x, const unsigned char* y) {
f25519_copy(p->x, x);
f25519_copy(p->y, y);
f25519_load(p->z, 1);
f25519_mul__distinct(p->t, x, y);
}
__device__ __forceinline__ void ed25519_unproject(unsigned char* x, unsigned char* y, const struct ed25519_pt* p) {
unsigned char z1[F25519_SIZE];
f25519_inv__distinct(z1, p->z);
f25519_mul__distinct(x, p->x, z1);
f25519_mul__distinct(y, p->y, z1);
f25519_normalize(x);
f25519_normalize(y);
}
__device__ __forceinline__ void ed25519_pack(unsigned char* c, const unsigned char* x, const unsigned char* y) {
unsigned char tmp[F25519_SIZE];
unsigned char parity;
f25519_copy(tmp, x);
f25519_normalize(tmp);
parity = (tmp[0] & 1) << 7;
f25519_copy(c, y);
f25519_normalize(c);
c[31] |= parity;
}
__device__ __forceinline__ void ed25519_add(struct ed25519_pt* r, const struct ed25519_pt* p1, const struct ed25519_pt* p2) {
unsigned char a[F25519_SIZE], b[F25519_SIZE], c[F25519_SIZE], d[F25519_SIZE];
unsigned char e[F25519_SIZE], f[F25519_SIZE], g[F25519_SIZE], h[F25519_SIZE];
f25519_sub(c, p1->y, p1->x);
f25519_sub(d, p2->y, p2->x);
f25519_mul__distinct(a, c, d);
f25519_add(c, p1->y, p1->x);
f25519_add(d, p2->y, p2->x);
f25519_mul__distinct(b, c, d);
f25519_mul__distinct(d, p1->t, p2->t);
f25519_mul__distinct(c, d, ed25519_k);
f25519_mul__distinct(d, p1->z, p2->z);
f25519_add(d, d, d);
f25519_sub(e, b, a);
f25519_sub(f, d, c);
f25519_add(g, d, c);
f25519_add(h, b, a);
f25519_mul__distinct(r->x, e, f);
f25519_mul__distinct(r->y, g, h);
f25519_mul__distinct(r->t, e, h);
f25519_mul__distinct(r->z, f, g);
}
__device__ __forceinline__ void ed25519_double(struct ed25519_pt* r, const struct ed25519_pt* p) {
unsigned char a[F25519_SIZE], b[F25519_SIZE], c[F25519_SIZE];
unsigned char e[F25519_SIZE], f[F25519_SIZE], g[F25519_SIZE], h[F25519_SIZE];
f25519_mul__distinct(a, p->x, p->x);
f25519_mul__distinct(b, p->y, p->y);
f25519_mul__distinct(c, p->z, p->z);
f25519_add(c, c, c);
f25519_add(f, p->x, p->y);
f25519_mul__distinct(e, f, f);
f25519_sub(e, e, a);
f25519_sub(e, e, b);
f25519_sub(g, b, a);
f25519_sub(f, g, c);
f25519_neg(h, b);
f25519_sub(h, h, a);
f25519_mul__distinct(r->x, e, f);
f25519_mul__distinct(r->y, g, h);
f25519_mul__distinct(r->t, e, h);
f25519_mul__distinct(r->z, f, g);
}
__device__ __forceinline__ void ed25519_copy(struct ed25519_pt* dst, const struct ed25519_pt* src) {
f25519_copy(dst->x, src->x);
f25519_copy(dst->y, src->y);
f25519_copy(dst->t, src->t);
f25519_copy(dst->z, src->z);
}
__device__ __forceinline__ void ed25519_smult(struct ed25519_pt* r_out, const struct ed25519_pt* p, const unsigned char* e) {
struct ed25519_pt r = ed25519_neutral;
for (int i = 255; i >= 0; i--) {
struct ed25519_pt s;
ed25519_double(&r, &r);
ed25519_add(&s, &r, p);
unsigned char bit = (e[i >> 3] >> (i & 7)) & 1;
f25519_select(r.x, r.x, s.x, bit);
f25519_select(r.y, r.y, s.y, bit);
f25519_select(r.z, r.z, s.z, bit);
f25519_select(r.t, r.t, s.t, bit);
}
ed25519_copy(r_out, &r);
}
__device__ __forceinline__ void ed25519_prepare(unsigned char* e) {
e[0] &= 0xf8;
e[31] &= 0x7f;
e[31] |= 0x40;
}

37
libs/edsign.cu Normal file
View File

@ -0,0 +1,37 @@
#include <edsign.cuh>
#include <ed25519.cuh>
#include <sha512.cuh>
__device__ void expand_key(unsigned char* expanded, const unsigned char* secret) {
struct sha512_state s;
sha512_init(&s);
sha512_final(&s, secret, 32);
sha512_get(&s, expanded, 0, 64);
ed25519_prepare(expanded);
}
__device__ void pp(unsigned char* packed, const struct ed25519_pt* p) {
unsigned char x[F25519_SIZE], y[F25519_SIZE];
ed25519_unproject(x, y, p);
ed25519_pack(packed, x, y);
}
__device__ void sm_pack(unsigned char* r, const unsigned char* k) {
struct ed25519_pt p;
ed25519_smult(&p, &ed25519_base, k);
pp(r, &p);
}
__device__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret) {
unsigned char expanded[64];
expand_key(expanded, secret);
sm_pack(pub, expanded);
}
__device__ void compact_wipe(void* data, unsigned long length) {
volatile unsigned char* p = (volatile unsigned char*)data;
while (length--) {
*p++ = 0;
}
}
__device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]) {
edsign_sec_to_pub(public_key, random_seed);
memcpy(private_key, random_seed, 32);
memcpy(private_key + 32, public_key, 32);
compact_wipe(random_seed, 32);
}

View File

@ -1,37 +1,10 @@
#pragma once #ifndef __EDSIGN_CUH
#include <ed25519.cuh> #define __EDSIGN_CUH
#include <sha512.cuh> #define F25519_SIZE 32
__device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret) { __device__ void expand_key(unsigned char* expanded, const unsigned char* secret);
struct sha512_state s; __device__ void pp(unsigned char* packed, const struct ed25519_pt* p);
sha512_init(&s); __device__ void sm_pack(unsigned char* r, const unsigned char* k);
sha512_final(&s, secret, 32); __device__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret);
sha512_get(&s, expanded, 0, 64); __device__ void compact_wipe(void* data, unsigned long length);
ed25519_prepare(expanded); __device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]);
} #endif
__device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p) {
unsigned char x[F25519_SIZE], y[F25519_SIZE];
ed25519_unproject(x, y, p);
ed25519_pack(packed, x, y);
}
__device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k) {
struct ed25519_pt p;
ed25519_smult(&p, &ed25519_base, k);
pp(r, &p);
}
__device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret) {
unsigned char expanded[64];
expand_key(expanded, secret);
sm_pack(pub, expanded);
}
__device__ void compact_wipe(void* data, unsigned long length) {
volatile unsigned char* p = (volatile unsigned char*)data;
while (length--) {
*p++ = 0;
}
}
__device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]) {
edsign_sec_to_pub(public_key, random_seed);
memcpy(private_key, random_seed, 32);
memcpy(private_key + 32, public_key, 32);
compact_wipe(random_seed, 32);
}

140
libs/f25519.cu Normal file
View File

@ -0,0 +1,140 @@
#include <f25519.cuh>
__device__ __forceinline__ void f25519_load(unsigned char* __restrict__ x, unsigned int c) {
#pragma unroll
for (unsigned int i = 0; i < sizeof(c); i++) {
x[i] = c & 0xFF;
c >>= 8;
}
#pragma unroll
for (unsigned int i = sizeof(c); i < F25519_SIZE; i++) {
x[i] = 0;
}
}
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
x[i] = a[i];
}
}
__device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond) {
const unsigned char mask = 0 - cond;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
dst[i] = zero[i] ^ (mask & (one[i] ^ zero[i]));
}
}
__device__ void f25519_normalize(unsigned char* __restrict__ x) {
unsigned char minusp[F25519_SIZE];
unsigned short c = (x[31] >> 7) * 19;
x[31] &= 127;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
c += x[i];
x[i] = (unsigned char)c;
c >>= 8;
}
c = 19;
#pragma unroll
for (int i = 0; i + 1 < F25519_SIZE; i++) {
c += x[i];
minusp[i] = (unsigned char)c;
c >>= 8;
}
c += x[F25519_SIZE - 1] - 128;
minusp[F25519_SIZE - 1] = (unsigned char)c;
f25519_select(x, minusp, x, (c >> 15) & 1);
}
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned short c = 0;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
c = (c >> 8) + ((unsigned short)a[i]) + ((unsigned short)b[i]);
r[i] = (unsigned char)c;
}
r[F25519_SIZE - 1] &= 127;
c = (c >> 7) * 19;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 218;
int i = 0;
#pragma unroll
for (i = 0; i + 1 < F25519_SIZE; i++) {
c += 65280 + ((unsigned)a[i]) - ((unsigned)b[i]);
r[i] = (unsigned char)c;
c >>= 8;
}
c += ((unsigned)a[i]) - ((unsigned)b[i]);
r[i] = (unsigned char)(c & 127);
c = (c >> 7) * 19;
#pragma unroll
for (i = 0; i < F25519_SIZE; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
unsigned c = 218;
int i = 0;
#pragma unroll
for (i = 0; i + 1 < F25519_SIZE; i++) {
c += 65280 - ((unsigned)a[i]);
r[i] = (unsigned char)c;
c >>= 8;
}
c -= ((unsigned)a[i]);
r[i] = (unsigned char)(c & 127);
c = (c >> 7) * 19;
#pragma unroll
for (i = 0; i < F25519_SIZE; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 0;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
c >>= 8;
for (int j = 0; j <= i; j++) {
c += ((unsigned)a[j]) * ((unsigned)b[i - j]);
}
for (int j = i + 1; j < F25519_SIZE; j++) {
c += ((unsigned)a[j]) * ((unsigned)b[F25519_SIZE + i - j]) * 38;
}
r[i] = (unsigned char)c;
}
r[F25519_SIZE - 1] &= 127;
c = (c >> 7) * 19;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
unsigned char s[F25519_SIZE];
f25519_mul__distinct(s, x, x);
f25519_mul__distinct(r, s, x);
#pragma unroll
for (int i = 0; i < 248; i++) {
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
}
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, s);
f25519_mul__distinct(s, r, x);
f25519_mul__distinct(r, s, s);
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
}

View File

@ -1,141 +1,13 @@
#pragma once #ifndef __F25519_CUH
#define __F25519_CUH
#define F25519_SIZE 32 #define F25519_SIZE 32
__device__ __forceinline__ void f25519_load(unsigned char* __restrict__ x, unsigned int c) { __device__ void f25519_load(unsigned char* __restrict__ x, unsigned int c);
#pragma unroll __device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a);
for (unsigned int i = 0; i < sizeof(c); i++) { __device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond);
x[i] = c & 0xFF; __device__ void f25519_normalize(unsigned char* __restrict__ x);
c >>= 8; __device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b);
} __device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b);
#pragma unroll __device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a);
for (unsigned int i = sizeof(c); i < F25519_SIZE; i++) { __device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b);
x[i] = 0; __device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x);
} #endif
}
__device__ __forceinline__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
x[i] = a[i];
}
}
__device__ __forceinline__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond) {
const unsigned char mask = 0 - cond;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
dst[i] = zero[i] ^ (mask & (one[i] ^ zero[i]));
}
}
__device__ __forceinline__ void f25519_normalize(unsigned char* __restrict__ x) {
unsigned char minusp[F25519_SIZE];
unsigned short c = (x[31] >> 7) * 19;
x[31] &= 127;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
c += x[i];
x[i] = (unsigned char)c;
c >>= 8;
}
c = 19;
#pragma unroll
for (int i = 0; i + 1 < F25519_SIZE; i++) {
c += x[i];
minusp[i] = (unsigned char)c;
c >>= 8;
}
c += x[F25519_SIZE - 1] - 128;
minusp[F25519_SIZE - 1] = (unsigned char)c;
f25519_select(x, minusp, x, (c >> 15) & 1);
}
__device__ __forceinline__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned short c = 0;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
c = (c >> 8) + ((unsigned short)a[i]) + ((unsigned short)b[i]);
r[i] = (unsigned char)c;
}
r[F25519_SIZE - 1] &= 127;
c = (c >> 7) * 19;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ __forceinline__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 218;
int i = 0;
#pragma unroll
for (i = 0; i + 1 < F25519_SIZE; i++) {
c += 65280 + ((unsigned)a[i]) - ((unsigned)b[i]);
r[i] = (unsigned char)c;
c >>= 8;
}
c += ((unsigned)a[i]) - ((unsigned)b[i]);
r[i] = (unsigned char)(c & 127);
c = (c >> 7) * 19;
#pragma unroll
for (i = 0; i < F25519_SIZE; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ __forceinline__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
unsigned c = 218;
int i = 0;
#pragma unroll
for (i = 0; i + 1 < F25519_SIZE; i++) {
c += 65280 - ((unsigned)a[i]);
r[i] = (unsigned char)c;
c >>= 8;
}
c -= ((unsigned)a[i]);
r[i] = (unsigned char)(c & 127);
c = (c >> 7) * 19;
#pragma unroll
for (i = 0; i < F25519_SIZE; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ __forceinline__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 0;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
c >>= 8;
for (int j = 0; j <= i; j++) {
c += ((unsigned)a[j]) * ((unsigned)b[i - j]);
}
for (int j = i + 1; j < F25519_SIZE; j++) {
c += ((unsigned)a[j]) * ((unsigned)b[F25519_SIZE + i - j]) * 38;
}
r[i] = (unsigned char)c;
}
r[F25519_SIZE - 1] &= 127;
c = (c >> 7) * 19;
#pragma unroll
for (int i = 0; i < F25519_SIZE; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
}
__device__ __forceinline__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
unsigned char s[F25519_SIZE];
f25519_mul__distinct(s, x, x);
f25519_mul__distinct(r, s, x);
#pragma unroll
for (int i = 0; i < 248; i++) {
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
}
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, s);
f25519_mul__distinct(s, r, x);
f25519_mul__distinct(r, s, s);
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
}

160
libs/sha512.cu Normal file
View File

@ -0,0 +1,160 @@
#include <sha512.cuh>
__device__ __constant__ sha512_state sha512_initial_state = { {
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
} };
__device__ __constant__ unsigned long round_k[80] = {
0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};
__device__ __forceinline__ unsigned long load64(const unsigned char* x) {
return ((unsigned long)x[0] << 56) | ((unsigned long)x[1] << 48) | ((unsigned long)x[2] << 40) | ((unsigned long)x[3] << 32)
| ((unsigned long)x[4] << 24) | ((unsigned long)x[5] << 16) | ((unsigned long)x[6] << 8) | ((unsigned long)x[7]);
}
__device__ __forceinline__ void store64(unsigned char* x, unsigned long v) {
x[0] = (unsigned char)(v >> 56);
x[1] = (unsigned char)(v >> 48);
x[2] = (unsigned char)(v >> 40);
x[3] = (unsigned char)(v >> 32);
x[4] = (unsigned char)(v >> 24);
x[5] = (unsigned char)(v >> 16);
x[6] = (unsigned char)(v >> 8);
x[7] = (unsigned char)(v);
}
__device__ __forceinline__ unsigned long rot64(unsigned long x, int bits) {
return (x >> bits) | (x << (64 - bits));
}
__device__ void sha512_block(sha512_state* s, const unsigned char* blk) {
unsigned long w[16];
#pragma unroll
for (int i = 0; i < 16; i++) {
w[i] = load64(blk + i * 8);
}
unsigned long a = s->h[0];
unsigned long b = s->h[1];
unsigned long c = s->h[2];
unsigned long d = s->h[3];
unsigned long e = s->h[4];
unsigned long f = s->h[5];
unsigned long g = s->h[6];
unsigned long h = s->h[7];
#pragma unroll
for (int i = 0; i < 80; i++) {
const int idx = i & 15;
const int idx1 = (i + 1) & 15;
const int idx7 = (i + 9) & 15;
const int idx14 = (i + 14) & 15;
unsigned long s0 = rot64(w[idx1], 1) ^ rot64(w[idx1], 8) ^ (w[idx1] >> 7);
unsigned long s1 = rot64(w[idx14], 19) ^ rot64(w[idx14], 61) ^ (w[idx14] >> 6);
unsigned long S0 = rot64(a, 28) ^ rot64(a, 34) ^ rot64(a, 39);
unsigned long S1 = rot64(e, 14) ^ rot64(e, 18) ^ rot64(e, 41);
unsigned long ch = (e & f) ^ ((~e) & g);
unsigned long temp1 = h + S1 + ch + round_k[i] + w[idx];
unsigned long maj = (a & b) ^ (a & c) ^ (b & c);
unsigned long temp2 = S0 + maj;
h = g;
g = f;
f = e;
e = d + temp1;
d = c;
c = b;
b = a;
a = temp1 + temp2;
w[idx] += s0 + w[idx7] + s1;
}
s->h[0] += a;
s->h[1] += b;
s->h[2] += c;
s->h[3] += d;
s->h[4] += e;
s->h[5] += f;
s->h[6] += g;
s->h[7] += h;
}
__device__ void sha512_final(sha512_state* s, const unsigned char* blk, unsigned long total_size) {
unsigned char temp[128];
memset(temp, 0, sizeof(temp));
unsigned long last_size = total_size & (128 - 1);
if (last_size) {
memcpy(temp, blk, last_size);
}
temp[last_size] = 0x80;
if (last_size > (128 - 9)) {
sha512_block(s, temp);
memset(temp, 0, sizeof(temp));
}
store64(temp + 128 - 8, total_size << 3);
sha512_block(s, temp);
}
__device__ void sha512_get(const sha512_state* s, unsigned char* hash, unsigned int offset, unsigned int len) {
if (offset > 128)
return;
if (len > 128 - offset)
len = 128 - offset;
unsigned int i = offset >> 3;
unsigned int off = offset & 7;
if (off) {
unsigned char tmp[8];
store64(tmp, s->h[i]);
unsigned int c = 8 - off;
if (c > len) c = len;
memcpy(hash, tmp + off, c);
len -= c;
hash += c;
i++;
}
while (len >= 8) {
store64(hash, s->h[i]);
hash += 8;
len -= 8;
i++;
}
if (len) {
unsigned char tmp[8];
store64(tmp, s->h[i]);
memcpy(hash, tmp, len);
}
}
__device__ void sha512_init(struct sha512_state* s) {
memcpy(s, &sha512_initial_state, sizeof(*s));
}

View File

@ -1,163 +1,15 @@
#pragma once #ifndef __SHA512_CUH
#define __SHA512_CUH
struct sha512_state { struct sha512_state {
unsigned long h[8]; unsigned long h[8];
}; };
__device__ __constant__ sha512_state sha512_initial_state = { { extern __device__ __constant__ sha512_state sha512_initial_state;
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, extern __device__ __constant__ unsigned long round_k[80];
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, __device__ __forceinline__ unsigned long load64(const unsigned char* x);
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, __device__ __forceinline__ void store64(unsigned char* x, unsigned long v);
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL, __device__ __forceinline__ unsigned long rot64(unsigned long x, int bits);
} }; __device__ void sha512_block(sha512_state* s, const unsigned char* blk);
__device__ __constant__ unsigned long round_k[80] = { __device__ void sha512_final(sha512_state* s, const unsigned char* blk, unsigned long total_size);
0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, __device__ void sha512_get(const sha512_state* s, unsigned char* hash, unsigned int offset, unsigned int len);
0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, __device__ void sha512_init(sha512_state* s);
0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, #endif
0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};
__device__ __forceinline__ unsigned long load64(const unsigned char* x) {
return ((unsigned long)x[0] << 56) | ((unsigned long)x[1] << 48) | ((unsigned long)x[2] << 40) | ((unsigned long)x[3] << 32)
| ((unsigned long)x[4] << 24) | ((unsigned long)x[5] << 16) | ((unsigned long)x[6] << 8) | ((unsigned long)x[7]);
}
__device__ __forceinline__ void store64(unsigned char* x, unsigned long v) {
x[0] = (unsigned char)(v >> 56);
x[1] = (unsigned char)(v >> 48);
x[2] = (unsigned char)(v >> 40);
x[3] = (unsigned char)(v >> 32);
x[4] = (unsigned char)(v >> 24);
x[5] = (unsigned char)(v >> 16);
x[6] = (unsigned char)(v >> 8);
x[7] = (unsigned char)(v);
}
__device__ __forceinline__ unsigned long rot64(unsigned long x, int bits) {
return (x >> bits) | (x << (64 - bits));
}
__device__ void sha512_block(sha512_state* s, const unsigned char* blk) {
unsigned long w[16];
#pragma unroll
for (int i = 0; i < 16; i++) {
w[i] = load64(blk + i * 8);
}
unsigned long a = s->h[0];
unsigned long b = s->h[1];
unsigned long c = s->h[2];
unsigned long d = s->h[3];
unsigned long e = s->h[4];
unsigned long f = s->h[5];
unsigned long g = s->h[6];
unsigned long h = s->h[7];
#pragma unroll
for (int i = 0; i < 80; i++) {
const int idx = i & 15;
const int idx1 = (i + 1) & 15;
const int idx7 = (i + 9) & 15;
const int idx14 = (i + 14) & 15;
unsigned long s0 = rot64(w[idx1], 1) ^ rot64(w[idx1], 8) ^ (w[idx1] >> 7);
unsigned long s1 = rot64(w[idx14], 19) ^ rot64(w[idx14], 61) ^ (w[idx14] >> 6);
unsigned long S0 = rot64(a, 28) ^ rot64(a, 34) ^ rot64(a, 39);
unsigned long S1 = rot64(e, 14) ^ rot64(e, 18) ^ rot64(e, 41);
unsigned long ch = (e & f) ^ ((~e) & g);
unsigned long temp1 = h + S1 + ch + round_k[i] + w[idx];
unsigned long maj = (a & b) ^ (a & c) ^ (b & c);
unsigned long temp2 = S0 + maj;
h = g;
g = f;
f = e;
e = d + temp1;
d = c;
c = b;
b = a;
a = temp1 + temp2;
w[idx] += s0 + w[idx7] + s1;
}
s->h[0] += a;
s->h[1] += b;
s->h[2] += c;
s->h[3] += d;
s->h[4] += e;
s->h[5] += f;
s->h[6] += g;
s->h[7] += h;
}
__device__ void sha512_final(sha512_state* s, const unsigned char* blk, unsigned long total_size) {
unsigned char temp[128];
memset(temp, 0, sizeof(temp));
unsigned long last_size = total_size & (128 - 1);
if (last_size) {
memcpy(temp, blk, last_size);
}
temp[last_size] = 0x80;
if (last_size > (128 - 9)) {
sha512_block(s, temp);
memset(temp, 0, sizeof(temp));
}
store64(temp + 128 - 8, total_size << 3);
sha512_block(s, temp);
}
__device__ void sha512_get(const sha512_state* s, unsigned char* hash, unsigned int offset, unsigned int len) {
if (offset > 128)
return;
if (len > 128 - offset)
len = 128 - offset;
unsigned int i = offset >> 3;
unsigned int off = offset & 7;
if (off) {
unsigned char tmp[8];
store64(tmp, s->h[i]);
unsigned int c = 8 - off;
if (c > len) c = len;
memcpy(hash, tmp + off, c);
len -= c;
hash += c;
i++;
}
while (len >= 8) {
store64(hash, s->h[i]);
hash += 8;
len -= 8;
i++;
}
if (len) {
unsigned char tmp[8];
store64(tmp, s->h[i]);
memcpy(hash, tmp, len);
}
}
__device__ void sha512_init(struct sha512_state* s) {
memcpy(s, &sha512_initial_state, sizeof(*s));
}

66
libs/string.cu Normal file
View File

@ -0,0 +1,66 @@
__device__ int cstring_length(const char* s) {
int len = 0;
while (s[len]) len++;
return len;
}
__device__ int cstring_find(const char* s, const char* sub) {
int i, j;
int n = cstring_length(s);
int m = cstring_length(sub);
if (m == 0) return 0;
for (i = 0; i <= n - m; i++) {
for (j = 0; j < m; j++) {
if (s[i + j] != sub[j]) break;
}
if (j == m) return i;
}
return -1;
}
__device__ int cstring_to_ull(const char* s, unsigned* val) {
unsigned result = 0;
int i = 0;
if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
i = 2;
}
if (s[i] == '\0') return 1;
for (; s[i]; i++) {
char c = s[i];
int digit;
if (c >= '0' && c <= '9') {
digit = c - '0';
} else if (c >= 'a' && c <= 'f') {
digit = 10 + (c - 'a');
} else if (c >= 'A' && c <= 'F') {
digit = 10 + (c - 'A');
} else {
return 1;
}
result = result * 16 + digit;
}
*val = result;
return 0;
}
__device__ void extract_substring(const char* src, int start, char* dest, int dest_size) {
int i = 0;
while (src[start + i] && i < dest_size - 1) {
dest[i] = src[start + i];
i++;
}
dest[i] = '\0';
}
__device__ void concat(const char* s1, const char* s2, char* out, int outSize) {
int i = 0, j = 0;
while (s1[i] && i < outSize - 1) {
out[i] = s1[i];
i++;
}
if (i < outSize - 1) {
out[i] = ' ';
i++;
}
while (s2[j] && i < outSize - 1) {
out[i] = s2[j];
i++; j++;
}
out[i] = '\0';
}

View File

@ -1,66 +1,8 @@
__device__ int cstring_length(const char* s) { #ifndef __STRING_CUH
int len = 0; #define __STRING_CUH
while (s[len]) len++; __device__ int cstring_length(const char* s);
return len; __device__ int cstring_find(const char* s, const char* sub);
} __device__ int cstring_to_ull(const char* s, unsigned* val);
__device__ int cstring_find(const char* s, const char* sub) { __device__ void extract_substring(const char* src, int start, char* dest, int dest_size);
int i, j; __device__ void concat(const char* s1, const char* s2, char* out, int outSize);
int n = cstring_length(s); #endif
int m = cstring_length(sub);
if (m == 0) return 0;
for (i = 0; i <= n - m; i++) {
for (j = 0; j < m; j++) {
if (s[i + j] != sub[j]) break;
}
if (j == m) return i;
}
return -1;
}
__device__ int cstring_to_ull(const char* s, unsigned* val) {
unsigned result = 0;
int i = 0;
if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
i = 2;
}
if (s[i] == '\0') return 1;
for (; s[i]; i++) {
char c = s[i];
int digit;
if (c >= '0' && c <= '9') {
digit = c - '0';
} else if (c >= 'a' && c <= 'f') {
digit = 10 + (c - 'a');
} else if (c >= 'A' && c <= 'F') {
digit = 10 + (c - 'A');
} else {
return 1;
}
result = result * 16 + digit;
}
*val = result;
return 0;
}
__device__ void extract_substring(const char* src, int start, char* dest, int dest_size) {
int i = 0;
while (src[start + i] && i < dest_size - 1) {
dest[i] = src[start + i];
i++;
}
dest[i] = '\0';
}
__device__ void concat(const char* s1, const char* s2, char* out, int outSize) {
int i = 0, j = 0;
while (s1[i] && i < outSize - 1) {
out[i] = s1[i];
i++;
}
if (i < outSize - 1) {
out[i] = ' ';
i++;
}
while (s2[j] && i < outSize - 1) {
out[i] = s2[j];
i++; j++;
}
out[i] = '\0';
}

View File

@ -1,6 +1,5 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
//#include <fstream>
#include <iomanip> #include <iomanip>
#include <vector> #include <vector>
#include <string> #include <string>
@ -133,7 +132,6 @@ void miner_thread() noexcept {
printf("Using seed: %lu\n", state); printf("Using seed: %lu\n", state);
while (true) { while (true) {
rmbytes(seed, sizeof(seed), state); rmbytes(seed, sizeof(seed), state);
//sign_keypair(keys.PublicKey, keys.PrivateKey, seed);
crypto_sign_ed25519_seed_keypair(keys.PublicKey, keys.PrivateKey, seed); crypto_sign_ed25519_seed_keypair(keys.PublicKey, keys.PrivateKey, seed);
if (ones = getZeros(keys.PublicKey); ones > conf.high.load()) { if (ones = getZeros(keys.PublicKey); ones > conf.high.load()) {
conf.high.store(ones); conf.high.store(ones);

View File

@ -1,8 +1,7 @@
#include <stdio.h>
#include <sstream>
#include <iostream>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <curand_kernel.h> #include <curand_kernel.h>
#include <sstream>
#include <iostream>
#include <sha512.cuh> #include <sha512.cuh>
#include <ed25519.cuh> #include <ed25519.cuh>
#include <edsign.cuh> #include <edsign.cuh>
@ -141,10 +140,9 @@ __device__ __forceinline__ void rmbytes(unsigned char* buf, unsigned long size,
} }
__device__ __forceinline__ void invertKey(const unsigned char* key, unsigned char* inverted) { __device__ __forceinline__ void invertKey(const unsigned char* key, unsigned char* inverted) {
#pragma unroll #pragma unroll
for (unsigned char i = 0; i < 32; i++) for (unsigned char i = 0; i < 32; i++) inverted[i] = key[i] ^ 0xFF;
inverted[i] = key[i] ^ 0xFF;
} }
__global__ void minerKernel(curandState* randStates) { __global__ void KeyGen(curandState* randStates) {
curandState localState = randStates[blockIdx.x * blockDim.x + threadIdx.x]; curandState localState = randStates[blockIdx.x * blockDim.x + threadIdx.x];
unsigned long long xorshiftState[2]; unsigned long long xorshiftState[2];
xorshiftState[0] = curand(&localState); xorshiftState[0] = curand(&localState);
@ -165,13 +163,12 @@ __global__ void minerKernel(curandState* randStates) {
} }
} }
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
if (argc < 2) return 0;
int* d_result; int* d_result;
cudaMalloc((void**)&d_result, sizeof(int)); cudaMalloc((void**)&d_result, sizeof(int));
char** d_argv; char** d_argv;
cudaMalloc((void**)&d_argv, argc * sizeof(char*)); cudaMalloc((void**)&d_argv, argc * sizeof(char*));
for (int i = 0; i < argc; i++) { for (int i = 0; i < argc; i++) {
size_t len = strlen(argv[i]) + 1; unsigned long len = strlen(argv[i]) + 1;
char* d_str; char* d_str;
cudaMalloc((void**)&d_str, len); cudaMalloc((void**)&d_str, len);
cudaMemcpy(d_str, argv[i], len, cudaMemcpyHostToDevice); cudaMemcpy(d_str, argv[i], len, cudaMemcpyHostToDevice);
@ -185,7 +182,7 @@ int main(int argc, char* argv[]) {
cudaDeviceProp prop; cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0); cudaGetDeviceProperties(&prop, 0);
int mBpSM; int mBpSM;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, minerKernel, threadsPerBlock, 0); cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGen, threadsPerBlock, 0);
int SMs = prop.multiProcessorCount; int SMs = prop.multiProcessorCount;
int maxBlocks = mBpSM * SMs; int maxBlocks = mBpSM * SMs;
const int totalThreads = maxBlocks * threadsPerBlock; const int totalThreads = maxBlocks * threadsPerBlock;
@ -193,12 +190,12 @@ int main(int argc, char* argv[]) {
printf("maxBlocks: %d\n", maxBlocks); printf("maxBlocks: %d\n", maxBlocks);
printf("totalThreads: %d\n", totalThreads); printf("totalThreads: %d\n", totalThreads);
printf("MaxBlocksPerSM: %d\n", mBpSM); printf("MaxBlocksPerSM: %d\n", mBpSM);
printf("Current config: <<<%d,%d>>>\n", totalThreads / threadsPerBlock, threadsPerBlock); printf("BlocksThreads: %d:%d\n", totalThreads / threadsPerBlock, threadsPerBlock);
curandState* rst; curandState* rst;
cudaMalloc(&rst, totalThreads * sizeof(curandState)); cudaMalloc(&rst, totalThreads * sizeof(curandState));
initRand<<<totalThreads / threadsPerBlock, threadsPerBlock>>>(rst); initRand<<<totalThreads / threadsPerBlock, threadsPerBlock>>>(rst);
cudaDeviceSynchronize(); cudaDeviceSynchronize();
minerKernel<<<totalThreads / threadsPerBlock, threadsPerBlock>>>(rst); KeyGen<<<totalThreads / threadsPerBlock, threadsPerBlock>>>(rst);
cudaDeviceSynchronize(); cudaDeviceSynchronize();
cudaFree(rst); cudaFree(rst);
return 0; return 0;