code cleanup
This commit is contained in:
parent
b95318103a
commit
9ce2eba09f
@ -1,10 +1,8 @@
|
||||
#ifndef __ED25519_CUH
|
||||
#define __ED25519_CUH
|
||||
#include <stdint.h>
|
||||
#pragma once
|
||||
#include <f25519.cuh>
|
||||
#define F25519_SIZE 32
|
||||
// Curve point held as four field elements x, y, t, z of F25519_SIZE bytes each.
// ed25519_project fills z with 1 and t with x*y.
// The member line appears twice in this listing (uint8_t and unsigned char spellings).
struct ed25519_pt {
|
||||
uint8_t x[F25519_SIZE], y[F25519_SIZE], t[F25519_SIZE], z[F25519_SIZE];
|
||||
unsigned char x[F25519_SIZE], y[F25519_SIZE], t[F25519_SIZE], z[F25519_SIZE];
|
||||
};
|
||||
__device__ __constant__ struct ed25519_pt ed25519_base = {
|
||||
{0x1a,0xd5,0x25,0x8f,0x60,0x2d,0x56,0xc9,0xb2,0xa7,0x25,0x95,0x60,0xc7,0x2c,0x69,
|
||||
@ -18,31 +16,31 @@ __device__ __constant__ struct ed25519_pt ed25519_base = {
|
||||
// Constant point with x = 0, y = 1, t = 0, z = 1 (only the first byte of y and z is non-zero).
// ed25519_smult uses it as the starting value of its accumulator.
__device__ __constant__ struct ed25519_pt ed25519_neutral = {
|
||||
{0}, {1,0}, {0}, {1,0}
|
||||
};
|
||||
// 32-byte constant in constant memory; its use is not visible in this listing.
// NOTE(review): presumably the Edwards curve parameter d, least significant byte first - confirm.
// The declaration line appears twice (uint8_t and unsigned char spellings).
__device__ __constant__ uint8_t ed25519_d[F25519_SIZE] = {
|
||||
__device__ __constant__ unsigned char ed25519_d[F25519_SIZE] = {
|
||||
0xa3,0x78,0x59,0x13,0xca,0x4d,0xeb,0x75,0xab,0xd8,0x41,0x41,0x4d,0x0a,0x70,0x00,
|
||||
0x98,0xe8,0x79,0x77,0x79,0x40,0xc7,0x8c,0x73,0xfe,0x6f,0x2b,0xee,0x6c,0x03,0x52
|
||||
};
|
||||
// 32-byte constant in constant memory; its use is not visible in this listing.
// NOTE(review): the bytes look like 2 * ed25519_d reduced mod 2^255 - 19 - confirm.
// The declaration line appears twice (uint8_t and unsigned char spellings).
__device__ __constant__ uint8_t ed25519_k[F25519_SIZE] = {
|
||||
__device__ __constant__ unsigned char ed25519_k[F25519_SIZE] = {
|
||||
0x59,0xf1,0xb2,0x26,0x94,0x9b,0xd6,0xeb,0x56,0xb1,0x83,0x82,0x9a,0x14,0xe0,0x00,
|
||||
0x30,0xd1,0xf3,0xee,0xf2,0x80,0x8e,0x19,0xe7,0xfc,0xdf,0x56,0xdc,0xd9,0x06,0x24
|
||||
};
|
||||
// Builds the four-coordinate point p from the coordinate pair (x, y):
// copies x and y, sets z to 1 and sets t to the product x*y.
// The signature appears twice in this listing (uint8_t and unsigned char spellings).
__device__ __forceinline__ void ed25519_project(struct ed25519_pt* p, const uint8_t* x, const uint8_t* y) {
|
||||
__device__ __forceinline__ void ed25519_project(struct ed25519_pt* p, const unsigned char* x, const unsigned char* y) {
|
||||
f25519_copy(p->x, x);
|
||||
f25519_copy(p->y, y);
|
||||
f25519_load(p->z, 1);
|
||||
f25519_mul__distinct(p->t, x, y);
|
||||
}
|
||||
// Converts point p back to a coordinate pair: x = p->x / p->z and y = p->y / p->z,
// computed by inverting z once and multiplying, then normalizing both outputs.
// p->t is not read. The signature and the z1 declaration each appear twice in this listing.
__device__ __forceinline__ void ed25519_unproject(uint8_t* x, uint8_t* y, const struct ed25519_pt* p) {
|
||||
uint8_t z1[F25519_SIZE];
|
||||
__device__ __forceinline__ void ed25519_unproject(unsigned char* x, unsigned char* y, const struct ed25519_pt* p) {
|
||||
unsigned char z1[F25519_SIZE];
|
||||
// z1 = 1 / z
f25519_inv__distinct(z1, p->z);
|
||||
f25519_mul__distinct(x, p->x, z1);
|
||||
f25519_mul__distinct(y, p->y, z1);
|
||||
f25519_normalize(x);
|
||||
f25519_normalize(y);
|
||||
}
|
||||
__device__ __forceinline__ void ed25519_pack(uint8_t* c, const uint8_t* x, const uint8_t* y) {
|
||||
uint8_t tmp[F25519_SIZE];
|
||||
uint8_t parity;
|
||||
__device__ __forceinline__ void ed25519_pack(unsigned char* c, const unsigned char* x, const unsigned char* y) {
|
||||
unsigned char tmp[F25519_SIZE];
|
||||
unsigned char parity;
|
||||
f25519_copy(tmp, x);
|
||||
f25519_normalize(tmp);
|
||||
parity = (tmp[0] & 1) << 7;
|
||||
@ -50,9 +48,10 @@ __device__ __forceinline__ void ed25519_pack(uint8_t* c, const uint8_t* x, const
|
||||
f25519_normalize(c);
|
||||
c[31] |= parity;
|
||||
}
|
||||
__device__ __forceinline__ uint8_t ed25519_try_unpack(uint8_t* x, uint8_t* y, const uint8_t* comp) {
|
||||
/*
|
||||
__device__ __forceinline__ unsigned char ed25519_try_unpack(unsigned char* x, unsigned char* y, const unsigned char* comp) {
|
||||
int parity = comp[31] >> 7;
|
||||
uint8_t a[F25519_SIZE], b[F25519_SIZE], c_[F25519_SIZE];
|
||||
unsigned char a[F25519_SIZE], b[F25519_SIZE], c_[F25519_SIZE];
|
||||
f25519_copy(y, comp);
|
||||
y[31] &= 127;
|
||||
f25519_mul__distinct(c_, y, y);
|
||||
@ -69,9 +68,10 @@ __device__ __forceinline__ uint8_t ed25519_try_unpack(uint8_t* x, uint8_t* y, co
|
||||
f25519_normalize(c_);
|
||||
return f25519_eq(a, c_);
|
||||
}
|
||||
*/
|
||||
__device__ __forceinline__ void ed25519_add(struct ed25519_pt* r, const struct ed25519_pt* p1, const struct ed25519_pt* p2) {
|
||||
uint8_t a[F25519_SIZE], b[F25519_SIZE], c[F25519_SIZE], d[F25519_SIZE];
|
||||
uint8_t e[F25519_SIZE], f[F25519_SIZE], g[F25519_SIZE], h[F25519_SIZE];
|
||||
unsigned char a[F25519_SIZE], b[F25519_SIZE], c[F25519_SIZE], d[F25519_SIZE];
|
||||
unsigned char e[F25519_SIZE], f[F25519_SIZE], g[F25519_SIZE], h[F25519_SIZE];
|
||||
f25519_sub(c, p1->y, p1->x);
|
||||
f25519_sub(d, p2->y, p2->x);
|
||||
f25519_mul__distinct(a, c, d);
|
||||
@ -92,8 +92,8 @@ __device__ __forceinline__ void ed25519_add(struct ed25519_pt* r, const struct e
|
||||
f25519_mul__distinct(r->z, f, g);
|
||||
}
|
||||
__device__ __forceinline__ void ed25519_double(struct ed25519_pt* r, const struct ed25519_pt* p) {
|
||||
uint8_t a[F25519_SIZE], b[F25519_SIZE], c[F25519_SIZE];
|
||||
uint8_t e[F25519_SIZE], f[F25519_SIZE], g[F25519_SIZE], h[F25519_SIZE];
|
||||
unsigned char a[F25519_SIZE], b[F25519_SIZE], c[F25519_SIZE];
|
||||
unsigned char e[F25519_SIZE], f[F25519_SIZE], g[F25519_SIZE], h[F25519_SIZE];
|
||||
f25519_mul__distinct(a, p->x, p->x);
|
||||
f25519_mul__distinct(b, p->y, p->y);
|
||||
f25519_mul__distinct(c, p->z, p->z);
|
||||
@ -117,13 +117,13 @@ __device__ __forceinline__ void ed25519_copy(struct ed25519_pt* dst, const struc
|
||||
f25519_copy(dst->t, src->t);
|
||||
f25519_copy(dst->z, src->z);
|
||||
}
|
||||
__device__ __forceinline__ void ed25519_smult(struct ed25519_pt* r_out, const struct ed25519_pt* p, const uint8_t* e) {
|
||||
__device__ __forceinline__ void ed25519_smult(struct ed25519_pt* r_out, const struct ed25519_pt* p, const unsigned char* e) {
|
||||
struct ed25519_pt r = ed25519_neutral;
|
||||
for (int i = 255; i >= 0; i--) {
|
||||
struct ed25519_pt s;
|
||||
ed25519_double(&r, &r);
|
||||
ed25519_add(&s, &r, p);
|
||||
uint8_t bit = (e[i >> 3] >> (i & 7)) & 1;
|
||||
unsigned char bit = (e[i >> 3] >> (i & 7)) & 1;
|
||||
f25519_select(r.x, r.x, s.x, bit);
|
||||
f25519_select(r.y, r.y, s.y, bit);
|
||||
f25519_select(r.z, r.z, s.z, bit);
|
||||
@ -131,9 +131,8 @@ __device__ __forceinline__ void ed25519_smult(struct ed25519_pt* r_out, const st
|
||||
}
|
||||
ed25519_copy(r_out, &r);
|
||||
}
|
||||
// Adjusts a scalar in place; only bytes 0 and 31 of e are touched.
// The signature appears twice in this listing (uint8_t and unsigned char spellings).
__device__ __forceinline__ void ed25519_prepare(uint8_t* e) {
|
||||
__device__ __forceinline__ void ed25519_prepare(unsigned char* e) {
|
||||
// clear the three lowest bits of byte 0
e[0] &= 0xf8;
|
||||
// clear the top bit of byte 31 ...
e[31] &= 0x7f;
|
||||
// ... and force bit 6 of byte 31 to one
e[31] |= 0x40;
|
||||
}
|
||||
#endif
|
||||
}
|
100
libs/edsign.cuh
100
libs/edsign.cuh
@ -1,83 +1,76 @@
|
||||
#ifndef __EDSIGN_CUH
|
||||
#define __EDSIGN_CUH
|
||||
#pragma once
|
||||
#include <ed25519.cuh>
|
||||
#ifndef COMPACT_DISABLE_ED25519
|
||||
#include <sha512.cuh>
|
||||
#include <fprime.cuh>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#define EXPANDED_SIZE 64
|
||||
#define EDSIGN_SECRET_KEY_SIZE 32
|
||||
#define EDSIGN_PUBLIC_KEY_SIZE 32
|
||||
#define EDSIGN_SIGNATURE_SIZE 64
|
||||
#define SHA512_HASH_SIZE 64
|
||||
// 32-byte constant passed as the modulus argument to fprime_from_bytes and fprime_add
// elsewhere in this listing.
// NOTE(review): presumably the order of the base point, least significant byte first - confirm.
// Two spellings of the table appear here (8 bytes per row with uint8_t, 16 bytes per row with
// unsigned char); the second one is preceded by a block-comment opener.
__device__ __constant__ uint8_t ed25519_order[FPRIME_SIZE] = {
|
||||
0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
|
||||
0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10
|
||||
/*
|
||||
__device__ __constant__ unsigned char ed25519_order[32] = {
|
||||
0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10
|
||||
};
|
||||
// Derives the expanded key from a secret: runs SHA-512 over the secret
// (EDSIGN_SECRET_KEY_SIZE, written as the literal 32 in the other variant), writes the digest
// into expanded, then applies ed25519_prepare to it (which edits bytes 0 and 31).
// NOTE(review): sha512_get(..., 0, 64) presumably means offset 0, length 64 - confirm.
// The signature and the two sha512 calls each appear twice in this listing (macro and literal forms).
__device__ __forceinline__ void expand_key(uint8_t* expanded, const uint8_t* secret) {
|
||||
*/
|
||||
__device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret) {
|
||||
struct sha512_state s;
|
||||
sha512_init(&s);
|
||||
sha512_final(&s, secret, EDSIGN_SECRET_KEY_SIZE);
|
||||
sha512_get(&s, expanded, 0, EXPANDED_SIZE);
|
||||
sha512_final(&s, secret, 32);
|
||||
sha512_get(&s, expanded, 0, 64);
|
||||
ed25519_prepare(expanded);
|
||||
}
|
||||
// Unpacks a packed point: ed25519_try_unpack yields (x, y) plus a status flag, then the pair
// is projected into p. The projection runs whatever the flag says; the flag is returned.
// The unsigned char variant of this function is preceded by a block-comment opener.
__device__ __forceinline__ uint8_t upp(struct ed25519_pt* p, const uint8_t* packed) {
|
||||
uint8_t x[F25519_SIZE], y[F25519_SIZE];
|
||||
uint8_t ok = ed25519_try_unpack(x, y, packed);
|
||||
/*
|
||||
__device__ __forceinline__ unsigned char upp(struct ed25519_pt* p, const unsigned char* packed) {
|
||||
unsigned char x[F25519_SIZE], y[F25519_SIZE];
|
||||
unsigned char ok = ed25519_try_unpack(x, y, packed);
|
||||
ed25519_project(p, x, y);
|
||||
return ok;
|
||||
}
|
||||
// Packs point p: converts it to a coordinate pair with ed25519_unproject and
// encodes that pair into packed with ed25519_pack.
// The signature and the local declaration each appear twice in this listing.
__device__ __forceinline__ void pp(uint8_t* packed, const struct ed25519_pt* p) {
|
||||
uint8_t x[F25519_SIZE], y[F25519_SIZE];
|
||||
*/
|
||||
__device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p) {
|
||||
unsigned char x[F25519_SIZE], y[F25519_SIZE];
|
||||
ed25519_unproject(x, y, p);
|
||||
ed25519_pack(packed, x, y);
|
||||
}
|
||||
// Multiplies the constant point ed25519_base by scalar k (ed25519_smult) and writes the
// packed result into r via pp.
// The signature appears twice in this listing (uint8_t and unsigned char spellings).
__device__ __forceinline__ void sm_pack(uint8_t* r, const uint8_t* k) {
|
||||
__device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k) {
|
||||
struct ed25519_pt p;
|
||||
ed25519_smult(&p, &ed25519_base, k);
|
||||
pp(r, &p);
|
||||
}
|
||||
// Computes the public key for a secret key: expands the secret into a 64-byte buffer
// (expand_key) and packs base * expanded into pub (sm_pack).
// ed25519_smult reads scalar bits 255..0, so only the first 32 bytes of expanded act as the scalar.
// The signature and the buffer declaration each appear twice in this listing.
__device__ __forceinline__ void edsign_sec_to_pub(uint8_t* pub, const uint8_t* secret) {
|
||||
uint8_t expanded[EXPANDED_SIZE];
|
||||
__device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret) {
|
||||
unsigned char expanded[64];
|
||||
expand_key(expanded, secret);
|
||||
sm_pack(pub, expanded);
|
||||
}
|
||||
// Hashes prefix || message with SHA-512 and reduces the digest modulo ed25519_order into out_fp.
//   init_block  : scratch buffer whose first prefix_size bytes already hold the prefix; it is
//                 overwritten (message bytes are appended, later the digest is stored in it).
//   message/len : message bytes and their count.
// Short inputs (len and len + prefix_size both below one block) are finished in a single call;
// otherwise the first block is completed from the message, whole blocks are fed from the
// message directly, and the tail is finalized.
// NOTE(review): the last argument of sha512_final looks like the total byte count hashed - confirm.
// Most lines appear twice in this listing (SHA512_* macros / size_t versus literals / unsigned long);
// the literal variant is preceded by a block-comment opener.
__device__ __forceinline__ void hash_with_prefix(uint8_t* out_fp, uint8_t* init_block, unsigned int prefix_size, const uint8_t* message, size_t len) {
|
||||
/*
|
||||
__device__ __forceinline__ void hash_with_prefix(unsigned char* out_fp, unsigned char* init_block, unsigned int prefix_size, const unsigned char* message, unsigned long len) {
|
||||
struct sha512_state s;
|
||||
sha512_init(&s);
|
||||
if (len < SHA512_BLOCK_SIZE && len + prefix_size < SHA512_BLOCK_SIZE) {
|
||||
if (len < 128 && len + prefix_size < 128) {
|
||||
memcpy(init_block + prefix_size, message, len);
|
||||
sha512_final(&s, init_block, len + prefix_size);
|
||||
} else {
|
||||
size_t i;
|
||||
memcpy(init_block + prefix_size, message, SHA512_BLOCK_SIZE - prefix_size);
|
||||
unsigned long i;
|
||||
memcpy(init_block + prefix_size, message, 128 - prefix_size);
|
||||
sha512_block(&s, init_block);
|
||||
// i is an offset into message; continue with whole blocks taken straight from it
for (i = SHA512_BLOCK_SIZE - prefix_size; i + SHA512_BLOCK_SIZE <= len; i += SHA512_BLOCK_SIZE) {
|
||||
for (i = 128 - prefix_size; i + 128 <= len; i += 128) {
|
||||
sha512_block(&s, message + i);
|
||||
}
|
||||
sha512_final(&s, message + i, len - i + prefix_size);
|
||||
}
|
||||
// digest goes into init_block, then is reduced modulo ed25519_order
sha512_get(&s, init_block, 0, SHA512_HASH_SIZE);
|
||||
fprime_from_bytes(out_fp, init_block, SHA512_HASH_SIZE, ed25519_order);
|
||||
sha512_get(&s, init_block, 0, 64);
|
||||
fprime_from_bytes(out_fp, init_block, 64, ed25519_order);
|
||||
}
|
||||
// Computes k = hash_with_prefix over (32 bytes of kgen_key || message): the key bytes are
// copied to the start of a local block buffer, which is then passed as the prefix.
// The signature and the buffer declaration each appear twice in this listing.
__device__ __forceinline__ void generate_k(uint8_t* k, const uint8_t* kgen_key, const uint8_t* message, size_t len) {
|
||||
uint8_t block[SHA512_BLOCK_SIZE];
|
||||
__device__ __forceinline__ void generate_k(unsigned char* k, const unsigned char* kgen_key, const unsigned char* message, unsigned long len) {
|
||||
unsigned char block[128];
|
||||
memcpy(block, kgen_key, 32);
|
||||
hash_with_prefix(k, block, 32, message, len);
|
||||
}
|
||||
// Computes z = hash_with_prefix over (32 bytes of r || 32 bytes of a || m): r and a are laid
// out back to back in a local block buffer that serves as a 64-byte prefix.
// The signature and the buffer declaration each appear twice in this listing.
__device__ __forceinline__ void hash_message(uint8_t* z, const uint8_t* r, const uint8_t* a, const uint8_t* m, size_t len) {
|
||||
uint8_t block[SHA512_BLOCK_SIZE];
|
||||
__device__ __forceinline__ void hash_message(unsigned char* z, const unsigned char* r, const unsigned char* a, const unsigned char* m, unsigned long len) {
|
||||
unsigned char block[128];
|
||||
memcpy(block, r, 32);
|
||||
memcpy(block + 32, a, 32);
|
||||
hash_with_prefix(z, block, 64, m, len);
|
||||
}
|
||||
__device__ void edsign_sign(uint8_t* signature, const uint8_t* pub, const uint8_t* secret, const uint8_t* message, size_t len) {
|
||||
uint8_t expanded[EXPANDED_SIZE];
|
||||
uint8_t e[FPRIME_SIZE], s[FPRIME_SIZE], k[FPRIME_SIZE], z[FPRIME_SIZE];
|
||||
__device__ void edsign_sign(unsigned char* signature, const unsigned char* pub, const unsigned char* secret, const unsigned char* message, unsigned long len) {
|
||||
unsigned char expanded[64];
|
||||
unsigned char e[32], s[32], k[32], z[32];
|
||||
expand_key(expanded, secret);
|
||||
generate_k(k, expanded + 32, message, len);
|
||||
sm_pack(signature, k);
|
||||
@ -87,10 +80,10 @@ __device__ void edsign_sign(uint8_t* signature, const uint8_t* pub, const uint8_
|
||||
fprime_add(s, k, ed25519_order);
|
||||
memcpy(signature + 32, s, 32);
|
||||
}
|
||||
__device__ uint8_t edsign_verify(const uint8_t* signature, const uint8_t* pub, const uint8_t* message, size_t len) {
|
||||
__device__ unsigned char edsign_verify(const unsigned char* signature, const unsigned char* pub, const unsigned char* message, unsigned long len) {
|
||||
struct ed25519_pt p, q;
|
||||
uint8_t lhs[F25519_SIZE], rhs[F25519_SIZE], z[FPRIME_SIZE];
|
||||
uint8_t ok = 1;
|
||||
unsigned char lhs[F25519_SIZE], rhs[F25519_SIZE], z[32];
|
||||
unsigned char ok = 1;
|
||||
hash_message(z, signature, pub, message, len);
|
||||
sm_pack(lhs, signature + 32);
|
||||
ok &= upp(&p, pub);
|
||||
@ -100,21 +93,4 @@ __device__ uint8_t edsign_verify(const uint8_t* signature, const uint8_t* pub, c
|
||||
pp(rhs, &p);
|
||||
return ok & f25519_eq(lhs, rhs);
|
||||
}
|
||||
// Kernel: one thread per signature. Thread idx (flat 1D index) returns early when
// idx >= num_messages; otherwise it signs with the idx-th public and secret key and writes
// the idx-th EDSIGN_SIGNATURE_SIZE-byte slot of d_signatures.
// NOTE(review): message is d_messages itself for every thread (no per-idx offset) while len is
// taken per idx from d_message_lens - confirm that a single shared message buffer is intended.
__global__ void sign_kernel(uint8_t* d_signatures, const uint8_t* d_pubs, const uint8_t* d_secrets, const uint8_t* d_messages, const size_t* d_message_lens, int num_messages) {
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx >= num_messages) return;
|
||||
uint8_t* signature = d_signatures + idx * EDSIGN_SIGNATURE_SIZE;
|
||||
const uint8_t* pub = d_pubs + idx * EDSIGN_PUBLIC_KEY_SIZE;
|
||||
const uint8_t* secret = d_secrets + idx * EDSIGN_SECRET_KEY_SIZE;
|
||||
const uint8_t* message = d_messages;
|
||||
size_t len = d_message_lens[idx];
|
||||
edsign_sign(signature, pub, secret, message, len);
|
||||
}
|
||||
// Host wrapper: launches sign_kernel with 256 threads per block and enough blocks to cover
// num_messages (ceiling division), then blocks on cudaDeviceSynchronize.
// NOTE(review): neither the launch nor the synchronize result is checked, so failures are
// silent here - consider cudaGetLastError and checking the returned cudaError_t.
void launch_sign_kernel(uint8_t* d_signatures, const uint8_t* d_pubs, const uint8_t* d_secrets, const uint8_t* d_messages, const size_t* d_message_lens, int num_messages) {
|
||||
int threadsPerBlock = 256;
|
||||
int blocksPerGrid = (num_messages + threadsPerBlock - 1) / threadsPerBlock;
|
||||
sign_kernel << <blocksPerGrid, threadsPerBlock >> > (d_signatures, d_pubs, d_secrets, d_messages, d_message_lens, num_messages);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
*/
|
101
libs/f25519.cuh
101
libs/f25519.cuh
@ -1,10 +1,8 @@
|
||||
#ifndef __F25519_CUH
|
||||
#define __F25519_CUH
|
||||
#include <stdint.h>
|
||||
#pragma once
|
||||
#define F25519_SIZE 32
|
||||
__device__ __constant__ uint8_t f25519_zero[F25519_SIZE] = { 0 };
|
||||
__device__ __constant__ uint8_t f25519_one[F25519_SIZE] = { 1 };
|
||||
__device__ __forceinline__ void f25519_load(uint8_t* __restrict__ x, uint32_t c) {
|
||||
__device__ __constant__ unsigned char f25519_zero[F25519_SIZE] = { 0 };
|
||||
__device__ __constant__ unsigned char f25519_one[F25519_SIZE] = { 1 };
|
||||
__device__ __forceinline__ void f25519_load(unsigned char* __restrict__ x, unsigned int c) {
|
||||
#pragma unroll
|
||||
for (unsigned int i = 0; i < sizeof(c); i++) {
|
||||
x[i] = c & 0xFF;
|
||||
@ -15,44 +13,42 @@ __device__ __forceinline__ void f25519_load(uint8_t* __restrict__ x, uint32_t c)
|
||||
x[i] = 0;
|
||||
}
|
||||
}
|
||||
// Copies the F25519_SIZE bytes of a into x, one byte per iteration.
// The signature appears twice in this listing (uint8_t and unsigned char spellings).
__device__ __forceinline__ void f25519_copy(uint8_t* __restrict__ x, const uint8_t* __restrict__ a) {
|
||||
__device__ __forceinline__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < F25519_SIZE; i++) {
|
||||
x[i] = a[i];
|
||||
}
|
||||
}
|
||||
// Branch-free select: dst = zero when cond is 0, dst = one when cond is 1.
// mask = 0 - cond is 0x00 or 0xFF for cond 0 or 1, and gates the XOR difference per byte.
// NOTE(review): all three pointers are __restrict__, yet callers in this listing pass dst equal
// to zero or to one (for example f25519_select(x, minusp, x, ...)) - confirm this aliasing is safe.
// The signature and the mask line each appear twice in this listing.
__device__ __forceinline__ void f25519_select(uint8_t* __restrict__ dst,
|
||||
const uint8_t* __restrict__ zero,
|
||||
const uint8_t* __restrict__ one, uint8_t cond) {
|
||||
const uint8_t mask = 0 - cond;
|
||||
__device__ __forceinline__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond) {
|
||||
const unsigned char mask = 0 - cond;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < F25519_SIZE; i++) {
|
||||
dst[i] = zero[i] ^ (mask & (one[i] ^ zero[i]));
|
||||
}
|
||||
}
|
||||
// Reduces x in place in two steps:
//  1. bit 255 is removed and re-added as 19 (carry propagated through all bytes);
//  2. minusp = x + 19 - 2^255 is computed, and x is replaced by minusp unless that
//     subtraction went negative.
// Declarations and the byte stores appear twice in this listing (uint8_t/uint16_t and
// unsigned char/unsigned short spellings).
__device__ __forceinline__ void f25519_normalize(uint8_t* __restrict__ x) {
|
||||
uint8_t minusp[F25519_SIZE];
|
||||
uint16_t c = (x[31] >> 7) * 19;
|
||||
__device__ __forceinline__ void f25519_normalize(unsigned char* __restrict__ x) {
|
||||
unsigned char minusp[F25519_SIZE];
|
||||
// c starts as 19 if the top bit of x is set, else 0
unsigned short c = (x[31] >> 7) * 19;
|
||||
x[31] &= 127;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < F25519_SIZE; i++) {
|
||||
c += x[i];
|
||||
x[i] = (uint8_t)c;
|
||||
x[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
// second pass: add 19 to the low 31 bytes ...
c = 19;
|
||||
#pragma unroll
|
||||
for (int i = 0; i + 1 < F25519_SIZE; i++) {
|
||||
c += x[i];
|
||||
minusp[i] = (uint8_t)c;
|
||||
minusp[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
// ... and subtract 128 from the top byte, i.e. 2^255 overall
c += x[F25519_SIZE - 1] - 128;
|
||||
minusp[F25519_SIZE - 1] = (uint8_t)c;
|
||||
minusp[F25519_SIZE - 1] = (unsigned char)c;
|
||||
// bit 15 of c set means the subtraction wrapped: keep x, otherwise take minusp
f25519_select(x, minusp, x, (c >> 15) & 1);
|
||||
}
|
||||
__device__ __forceinline__ uint8_t f25519_eq(const uint8_t* __restrict__ x, const uint8_t* __restrict__ y) {
|
||||
uint8_t s = 0;
|
||||
__device__ __forceinline__ unsigned char f25519_eq(const unsigned char* __restrict__ x, const unsigned char* __restrict__ y) {
|
||||
unsigned char s = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < F25519_SIZE; i++)
|
||||
s |= x[i] ^ y[i];
|
||||
@ -61,68 +57,61 @@ __device__ __forceinline__ uint8_t f25519_eq(const uint8_t* __restrict__ x, cons
|
||||
s |= s >> 1;
|
||||
return (s ^ 1) & 1;
|
||||
}
|
||||
// r = a + b: byte-wise addition with carry, then bit 255 and the carry above it are removed
// and folded back as a multiple of 19 in a second carry pass.
// The result is not passed through f25519_normalize here.
// The signature, the carry declaration and the byte stores appear twice in this listing.
__device__ __forceinline__ void f25519_add(uint8_t* __restrict__ r,
|
||||
const uint8_t* __restrict__ a,
|
||||
const uint8_t* __restrict__ b) {
|
||||
uint16_t c = 0;
|
||||
__device__ __forceinline__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
||||
unsigned short c = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < F25519_SIZE; i++) {
|
||||
c = (c >> 8) + ((uint16_t)a[i]) + ((uint16_t)b[i]);
|
||||
r[i] = (uint8_t)c;
|
||||
c = (c >> 8) + ((unsigned short)a[i]) + ((unsigned short)b[i]);
|
||||
r[i] = (unsigned char)c;
|
||||
}
|
||||
r[F25519_SIZE - 1] &= 127;
|
||||
// c >> 7 is everything at or above bit 255; each unit is worth 19
c = (c >> 7) * 19;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < F25519_SIZE; i++) {
|
||||
c += r[i];
|
||||
r[i] = (uint8_t)c;
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
}
|
||||
// r = a - b, computed byte-wise on a biased running value so it never goes negative, followed
// by the same fold of bits at or above 255 (times 19) that the other field routines use.
// NOTE(review): the start value 218 plus 65280 per byte presumably adds 2 * (2^255 - 19) before
// subtracting b - confirm.
// The signature and the byte stores appear twice in this listing.
__device__ __forceinline__ void f25519_sub(uint8_t* __restrict__ r,
|
||||
const uint8_t* __restrict__ a,
|
||||
const uint8_t* __restrict__ b) {
|
||||
__device__ __forceinline__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
||||
uint32_t c = 218;
|
||||
int i = 0;
|
||||
#pragma unroll
|
||||
for (i = 0; i + 1 < F25519_SIZE; i++) {
|
||||
c += 65280 + ((uint32_t)a[i]) - ((uint32_t)b[i]);
|
||||
r[i] = (uint8_t)c;
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
// top byte (i == F25519_SIZE - 1) is handled without the 65280 bias
c += ((uint32_t)a[i]) - ((uint32_t)b[i]);
|
||||
r[i] = (uint8_t)(c & 127);
|
||||
r[i] = (unsigned char)(c & 127);
|
||||
c = (c >> 7) * 19;
|
||||
#pragma unroll
|
||||
for (i = 0; i < F25519_SIZE; i++) {
|
||||
c += r[i];
|
||||
r[i] = (uint8_t)c;
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
}
|
||||
// r = -a: same structure as f25519_sub with the minuend left out (biased running value minus
// a, then bits at or above 255 folded back times 19).
// NOTE(review): the 218 / 65280 bias presumably represents 2 * (2^255 - 19) - confirm.
// The signature and the byte stores appear twice in this listing.
__device__ __forceinline__ void f25519_neg(uint8_t* __restrict__ r,
|
||||
const uint8_t* __restrict__ a) {
|
||||
__device__ __forceinline__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
|
||||
uint32_t c = 218;
|
||||
int i = 0;
|
||||
#pragma unroll
|
||||
for (i = 0; i + 1 < F25519_SIZE; i++) {
|
||||
c += 65280 - ((uint32_t)a[i]);
|
||||
r[i] = (uint8_t)c;
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
c -= ((uint32_t)a[i]);
|
||||
r[i] = (uint8_t)(c & 127);
|
||||
r[i] = (unsigned char)(c & 127);
|
||||
c = (c >> 7) * 19;
|
||||
#pragma unroll
|
||||
for (i = 0; i < F25519_SIZE; i++) {
|
||||
c += r[i];
|
||||
r[i] = (uint8_t)c;
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
}
|
||||
__device__ __forceinline__ void f25519_mul__distinct(uint8_t* __restrict__ r,
|
||||
const uint8_t* __restrict__ a,
|
||||
const uint8_t* __restrict__ b) {
|
||||
__device__ __forceinline__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
|
||||
uint32_t c = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < F25519_SIZE; i++) {
|
||||
@ -133,37 +122,37 @@ __device__ __forceinline__ void f25519_mul__distinct(uint8_t* __restrict__ r,
|
||||
for (int j = i + 1; j < F25519_SIZE; j++) {
|
||||
c += ((uint32_t)a[j]) * ((uint32_t)b[F25519_SIZE + i - j]) * 38;
|
||||
}
|
||||
r[i] = (uint8_t)c;
|
||||
r[i] = (unsigned char)c;
|
||||
}
|
||||
r[F25519_SIZE - 1] &= 127;
|
||||
c = (c >> 7) * 19;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < F25519_SIZE; i++) {
|
||||
c += r[i];
|
||||
r[i] = (uint8_t)c;
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
}
|
||||
// r = a * b for a small integer b: every byte of a is multiplied by b with carry propagation,
// then bits at or above 255 are folded back times 19 in a second carry pass.
// The unsigned char variant of the signature is preceded by a block-comment opener;
// the byte stores appear twice in this listing.
__device__ __forceinline__ void f25519_mul_c(uint8_t* __restrict__ r,
|
||||
const uint8_t* __restrict__ a, uint32_t b) {
|
||||
/*
|
||||
__device__ __forceinline__ void f25519_mul_c(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, uint32_t b) {
|
||||
uint32_t c = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < F25519_SIZE; i++) {
|
||||
c = (c >> 8) + b * ((uint32_t)a[i]);
|
||||
r[i] = (uint8_t)c;
|
||||
r[i] = (unsigned char)c;
|
||||
}
|
||||
r[F25519_SIZE - 1] &= 127;
|
||||
c = (c >> 7) * 19;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < F25519_SIZE; i++) {
|
||||
c += r[i];
|
||||
r[i] = (uint8_t)c;
|
||||
r[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
}
|
||||
__device__ __forceinline__ void f25519_inv__distinct(uint8_t* __restrict__ r,
|
||||
const uint8_t* __restrict__ x) {
|
||||
uint8_t s[F25519_SIZE];
|
||||
*/
|
||||
__device__ __forceinline__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
|
||||
unsigned char s[F25519_SIZE];
|
||||
f25519_mul__distinct(s, x, x);
|
||||
f25519_mul__distinct(r, s, x);
|
||||
#pragma unroll
|
||||
@ -180,9 +169,8 @@ __device__ __forceinline__ void f25519_inv__distinct(uint8_t* __restrict__ r,
|
||||
f25519_mul__distinct(s, r, r);
|
||||
f25519_mul__distinct(r, s, x);
|
||||
}
|
||||
__device__ __forceinline__ void exp2523(uint8_t* __restrict__ r,
|
||||
const uint8_t* __restrict__ x,
|
||||
uint8_t* __restrict__ s) {
|
||||
/*
|
||||
__device__ __forceinline__ void exp2523(unsigned char* __restrict__ r, const unsigned char* __restrict__ x, unsigned char* __restrict__ s) {
|
||||
int i;
|
||||
f25519_mul__distinct(r, x, x);
|
||||
f25519_mul__distinct(s, r, x);
|
||||
@ -195,17 +183,16 @@ __device__ __forceinline__ void exp2523(uint8_t* __restrict__ r,
|
||||
f25519_mul__distinct(s, r, r);
|
||||
f25519_mul__distinct(r, s, x);
|
||||
}
|
||||
// Writes r = a * v * (2*a*v^2 - 1), where v = exp2523(2*a).
// NOTE(review): presumably a square root of a when one exists; nothing here checks that - confirm.
// NOTE(review): f25519_sub(i_val, i_val, one) passes the same buffer as output and input although
// the parameters are __restrict__ - confirm this is safe.
// The signature and the local declarations appear twice in this listing.
__device__ __forceinline__ void f25519_sqrt(uint8_t* __restrict__ r,
|
||||
const uint8_t* __restrict__ a) {
|
||||
uint8_t v[F25519_SIZE], i_val[F25519_SIZE], x[F25519_SIZE], y[F25519_SIZE];
|
||||
__device__ __forceinline__ void f25519_sqrt(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
|
||||
unsigned char v[F25519_SIZE], i_val[F25519_SIZE], x[F25519_SIZE], y[F25519_SIZE];
|
||||
// x = 2a
f25519_mul_c(x, a, 2);
|
||||
// v = exp2523(2a); y is passed as the third (scratch) argument
exp2523(v, x, y);
|
||||
// y = v^2, i_val = 2a * v^2
f25519_mul__distinct(y, v, v);
|
||||
f25519_mul__distinct(i_val, x, y);
|
||||
uint8_t one[F25519_SIZE];
|
||||
unsigned char one[F25519_SIZE];
|
||||
f25519_load(one, 1);
|
||||
// i_val = 2a * v^2 - 1
f25519_sub(i_val, i_val, one);
|
||||
// r = (v * a) * i_val
f25519_mul__distinct(x, v, a);
|
||||
f25519_mul__distinct(r, x, i_val);
|
||||
}
|
||||
#endif
|
||||
*/
|
139
libs/fprime.cuh
139
libs/fprime.cuh
@ -1,44 +1,33 @@
|
||||
#ifndef __FPRIME_CUH
|
||||
#define __FPRIME_CUH
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#pragma once
|
||||
#include <string.h>
|
||||
#ifndef COMPACT_DISABLE_ED25519
|
||||
#ifdef FULL_C25519_CODE
|
||||
__device__ const uint8_t fprime_zero[FPRIME_SIZE] = { 0 };
|
||||
__device__ const uint8_t fprime_one[FPRIME_SIZE] = { 1 };
|
||||
#endif
|
||||
#define FPRIME_SIZE 32
|
||||
// In-place multi-byte addition x += p over 32 bytes, lowest index first, with carry between
// bytes. The carry out of the last byte is dropped.
// The opening lines appear twice in this listing (uint8_t/uint16_t/FPRIME_SIZE and
// unsigned char/unsigned short/32 spellings).
__device__ void raw_add(uint8_t* x, const uint8_t* p) {
|
||||
uint16_t c = 0;
|
||||
for (int i = 0; i < FPRIME_SIZE; i++) {
|
||||
c += ((uint16_t)x[i]) + ((uint16_t)p[i]);
|
||||
x[i] = (uint8_t)c;
|
||||
__device__ void raw_add(unsigned char* x, const unsigned char* p) {
|
||||
unsigned short c = 0;
|
||||
for (int i = 0; i < 32; i++) {
|
||||
c += ((unsigned short)x[i]) + ((unsigned short)p[i]);
|
||||
x[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
}
|
||||
// Branch-free select over 32 bytes: dst = zero when condition is 0, dst = one when it is 1.
// mask = -condition is 0x00 or 0xFF for condition 0 or 1.
// The opening lines appear twice in this listing (uint8_t/FPRIME_SIZE and unsigned char/32).
__device__ void fprime_select(uint8_t* dst, const uint8_t* zero, const uint8_t* one, uint8_t condition) {
|
||||
const uint8_t mask = -condition;
|
||||
for (int i = 0; i < FPRIME_SIZE; i++)
|
||||
__device__ void fprime_select(unsigned char* dst, const unsigned char* zero, const unsigned char* one, unsigned char condition) {
|
||||
const unsigned char mask = -condition;
|
||||
for (int i = 0; i < 32; i++)
|
||||
dst[i] = zero[i] ^ (mask & (one[i] ^ zero[i]));
|
||||
}
|
||||
// Conditional subtraction: computes x - p into a temporary with borrow propagation and
// replaces x with it only when no borrow remains after the last byte; otherwise x is unchanged.
// The choice is made with fprime_select rather than a branch.
// The opening lines appear twice in this listing (uint8_t/uint16_t/FPRIME_SIZE and
// unsigned char/unsigned short/32 spellings).
__device__ void raw_try_sub(uint8_t* x, const uint8_t* p)
|
||||
{
|
||||
uint8_t minusp[FPRIME_SIZE];
|
||||
uint16_t c = 0;
|
||||
for (int i = 0; i < FPRIME_SIZE; i++) {
|
||||
c = ((uint16_t)x[i]) - ((uint16_t)p[i]) - c;
|
||||
minusp[i] = (uint8_t)c;
|
||||
__device__ void raw_try_sub(unsigned char* x, const unsigned char* p) {
|
||||
unsigned char minusp[32];
|
||||
unsigned short c = 0;
|
||||
for (int i = 0; i < 32; i++) {
|
||||
c = ((unsigned short)x[i]) - ((unsigned short)p[i]) - c;
|
||||
minusp[i] = (unsigned char)c;
|
||||
// keep only the borrow (0 or 1) for the next byte
c = (c >> 8) & 1;
|
||||
}
|
||||
// c == 1: the subtraction borrowed, keep x; c == 0: take x - p
fprime_select(x, minusp, x, c);
|
||||
}
|
||||
__device__ int prime_msb(const uint8_t* p) {
|
||||
__device__ int prime_msb(const unsigned char* p) {
|
||||
int i;
|
||||
uint8_t x;
|
||||
for (i = FPRIME_SIZE - 1; i >= 0; i--) {
|
||||
if (p[i])
|
||||
break;
|
||||
unsigned char x;
|
||||
for (i = 32 - 1; i >= 0; i--) {
|
||||
if (p[i]) break;
|
||||
}
|
||||
x = p[i];
|
||||
i <<= 3;
|
||||
@ -48,35 +37,23 @@ __device__ int prime_msb(const uint8_t* p) {
|
||||
}
|
||||
return i - 1;
|
||||
}
|
||||
// Shifts the 32-byte value x left by n bits in place, carrying the bits shifted out of each
// byte into the next higher index. Bits shifted out of the last byte are dropped.
// NOTE(review): the 16-bit accumulator suggests n is expected to be at most 8 - confirm.
// The opening lines appear twice in this listing (uint8_t/uint16_t/FPRIME_SIZE and
// unsigned char/unsigned short/32 spellings).
__device__ void shift_n_bits(uint8_t* x, int n) {
|
||||
uint16_t c = 0;
|
||||
for (int i = 0; i < FPRIME_SIZE; i++) {
|
||||
c |= ((uint16_t)x[i]) << n;
|
||||
x[i] = (uint8_t)c;
|
||||
__device__ void shift_n_bits(unsigned char* x, int n) {
|
||||
unsigned short c = 0;
|
||||
for (int i = 0; i < 32; i++) {
|
||||
c |= ((unsigned short)x[i]) << n;
|
||||
x[i] = (unsigned char)c;
|
||||
c >>= 8;
|
||||
}
|
||||
}
|
||||
#ifdef FULL_C25519_CODE
|
||||
// Stores the integer c into x, lowest byte first, and zero-fills the remaining bytes up to
// FPRIME_SIZE. In this listing the function sits between #ifdef FULL_C25519_CODE and #endif.
__device__ void fprime_load(uint8_t* x, uint32_t c)
|
||||
{
|
||||
unsigned int i;
|
||||
for (i = 0; i < sizeof(c); i++) {
|
||||
x[i] = (uint8_t)c;
|
||||
c >>= 8;
|
||||
}
|
||||
for (; i < FPRIME_SIZE; i++)
|
||||
x[i] = 0;
|
||||
}
|
||||
#endif
|
||||
// Returns the smaller of a and b (b when they are equal).
__device__ inline int min_int(int a, int b) {
|
||||
return a < b ? a : b;
|
||||
}
|
||||
__device__ void fprime_from_bytes(uint8_t* n, const uint8_t* x, size_t len, const uint8_t* modulus) {
|
||||
__device__ void fprime_from_bytes(unsigned char* n, const unsigned char* x, unsigned long len, const unsigned char* modulus) {
|
||||
const int preload_total = min_int(prime_msb(modulus) - 1, (int)(len << 3));
|
||||
const int preload_bytes = preload_total >> 3;
|
||||
const int preload_bits = preload_total & 7;
|
||||
const int rbits = (len << 3) - preload_total;
|
||||
memset(n, 0, FPRIME_SIZE);
|
||||
memset(n, 0, 32);
|
||||
for (int i = 0; i < preload_bytes; i++)
|
||||
n[i] = x[len - preload_bytes + i];
|
||||
if (preload_bits) {
|
||||
@ -84,74 +61,28 @@ __device__ void fprime_from_bytes(uint8_t* n, const uint8_t* x, size_t len, cons
|
||||
n[0] |= x[len - preload_bytes - 1] >> (8 - preload_bits);
|
||||
}
|
||||
for (int i = rbits - 1; i >= 0; i--) {
|
||||
const uint8_t bit = (x[i >> 3] >> (i & 7)) & 1;
|
||||
const unsigned char bit = (x[i >> 3] >> (i & 7)) & 1;
|
||||
shift_n_bits(n, 1);
|
||||
n[0] |= bit;
|
||||
raw_try_sub(n, modulus);
|
||||
}
|
||||
}
|
||||
#ifdef FULL_C25519_CODE
|
||||
// Reduces x modulo modulus in place: fprime_from_bytes builds the reduced value in a
// temporary from the FPRIME_SIZE bytes of x, and the temporary is copied back.
__device__ void fprime_normalize(uint8_t* x, const uint8_t* modulus) {
|
||||
uint8_t n[FPRIME_SIZE];
|
||||
fprime_from_bytes(n, x, FPRIME_SIZE, modulus);
|
||||
fprime_copy(x, n);
|
||||
}
|
||||
// Returns 1 when x and y are byte-for-byte equal over FPRIME_SIZE bytes and 0 otherwise,
// with no branch on the data: all byte differences are OR-ed together, the bits are
// folded down into bit 0, and that bit is inverted.
__device__ uint8_t fprime_eq(const uint8_t* x, const uint8_t* y) {
|
||||
uint8_t sum = 0;
|
||||
for (int i = 0; i < FPRIME_SIZE; i++)
|
||||
sum |= x[i] ^ y[i];
|
||||
sum |= (sum >> 4);
|
||||
sum |= (sum >> 2);
|
||||
sum |= (sum >> 1);
|
||||
return (sum ^ 1) & 1;
|
||||
}
|
||||
#endif
|
||||
// r = r + a, followed by one conditional subtraction of modulus (raw_try_sub).
// NOTE(review): a single subtraction only brings the sum back below modulus if both inputs
// were already below it - confirm callers guarantee that.
// The signature appears twice in this listing (uint8_t and unsigned char spellings).
__device__ void fprime_add(uint8_t* r, const uint8_t* a, const uint8_t* modulus) {
|
||||
__device__ void fprime_add(unsigned char* r, const unsigned char* a, const unsigned char* modulus) {
|
||||
raw_add(r, a);
|
||||
raw_try_sub(r, modulus);
|
||||
}
|
||||
#ifdef FULL_C25519_CODE
|
||||
// Two definitions are interleaved in this stretch of the listing:
//  - fprime_sub: r = r - a, done as r += modulus, then a conditional subtraction of a, then a
//    conditional subtraction of modulus;
//  - fprime_copy: copies 32 bytes (FPRIME_SIZE in the other variant) from a to x with memcpy.
// fprime_copy appears twice (unsigned char and uint8_t spellings), on either side of an #endif.
__device__ void fprime_sub(uint8_t* r, const uint8_t* a, const uint8_t* modulus) {
|
||||
raw_add(r, modulus);
|
||||
raw_try_sub(r, a);
|
||||
raw_try_sub(r, modulus);
|
||||
__device__ inline void fprime_copy(unsigned char* x, const unsigned char* a) {
|
||||
memcpy(x, a, 32);
|
||||
}
|
||||
#endif
|
||||
__device__ inline void fprime_copy(uint8_t* x, const uint8_t* a) {
|
||||
memcpy(x, a, FPRIME_SIZE);
|
||||
}
|
||||
// r = a * b reduced by modulus, using bit-by-bit double-and-add over the bits of b from
// index prime_msb(modulus) down to 0. Each step doubles r (shift plus conditional subtract),
// forms r + a in a temporary, and keeps one of the two through fprime_select, so the
// operations performed do not depend on the bit value.
// Several lines appear twice in this listing (uint8_t/FPRIME_SIZE and unsigned char/32).
__device__ void fprime_mul(uint8_t* r, const uint8_t* a, const uint8_t* b, const uint8_t* modulus) {
|
||||
memset(r, 0, FPRIME_SIZE);
|
||||
__device__ void fprime_mul(unsigned char* r, const unsigned char* a, const unsigned char* b, const unsigned char* modulus) {
|
||||
memset(r, 0, 32);
|
||||
for (int i = prime_msb(modulus); i >= 0; i--) {
|
||||
const uint8_t bit = (b[i >> 3] >> (i & 7)) & 1;
|
||||
uint8_t plusa[FPRIME_SIZE];
|
||||
const unsigned char bit = (b[i >> 3] >> (i & 7)) & 1;
|
||||
unsigned char plusa[32];
|
||||
// r = 2r, reduced
shift_n_bits(r, 1);
|
||||
raw_try_sub(r, modulus);
|
||||
// plusa = r + a, reduced
fprime_copy(plusa, r);
|
||||
fprime_add(plusa, a, modulus);
|
||||
// take plusa when the current bit of b is 1
fprime_select(r, r, plusa, bit);
|
||||
}
|
||||
}
|
||||
#ifdef FULL_C25519_CODE
|
||||
// r = a raised to the power (modulus - 2), reduced by modulus, via square-and-multiply over
// the bits of the exponent from index prime_msb(modulus) down to 0.
// NOTE(review): presumably a modular inverse by the Fermat little theorem, which needs a
// prime modulus - confirm.
// NOTE(review): in the pm2 loop the carry is c >>= 8 with no mask; if a byte subtraction
// borrowed, c would become 0xFF rather than 1 (raw_try_sub masks with & 1) - confirm that no
// borrow can occur for the moduli used.
// NOTE(review): the multiply-or-copy choice branches on an exponent bit.
__device__ void fprime_inv(uint8_t* r, const uint8_t* a, const uint8_t* modulus) {
|
||||
uint8_t pm2[FPRIME_SIZE];
|
||||
uint16_t c = 2;
|
||||
fprime_copy(pm2, modulus);
|
||||
// pm2 = modulus - 2, byte by byte
for (int i = 0; i < FPRIME_SIZE; i++) {
|
||||
c = modulus[i] - c;
|
||||
pm2[i] = (uint8_t)c;
|
||||
c >>= 8;
|
||||
}
|
||||
fprime_load(r, 1);
|
||||
for (int i = prime_msb(modulus); i >= 0; i--) {
|
||||
uint8_t r2[FPRIME_SIZE];
|
||||
// square, then multiply by a when the exponent bit is set
fprime_mul(r2, r, r, modulus);
|
||||
if ((pm2[i >> 3] >> (i & 7)) & 1)
|
||||
fprime_mul(r, r2, a, modulus);
|
||||
else
|
||||
fprime_copy(r, r2);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
@ -1,20 +1,13 @@
|
||||
#ifndef __SHA512_CUH
|
||||
#define __SHA512_CUH
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#define SHA512_BLOCK_SIZE 128
|
||||
#pragma once
|
||||
// Running hash state: eight words h[0..7].
// The member appears twice in this listing, as uint64_t and as unsigned long.
// NOTE(review): unsigned long is not guaranteed to be 64 bits wide (it is 32 bits on LLP64
// targets such as 64-bit Windows), while the initial values assigned to this struct are
// 64-bit constants - confirm the target platform or prefer a fixed-width type.
struct sha512_state {
|
||||
uint64_t h[8];
|
||||
unsigned long h[8];
|
||||
};
|
||||
#if !defined(COMPACT_DISABLE_ED25519) || !defined(COMPACT_DISABLE_X25519_DERIVE)
|
||||
// Initial value of the eight state words, kept in constant memory.
// The values match the SHA-512 initial hash values listed in FIPS 180-4.
__device__ __constant__ sha512_state sha512_initial_state = { {
|
||||
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
|
||||
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
|
||||
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
|
||||
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
|
||||
} };
|
||||
#endif
|
||||
__device__ __constant__ uint64_t round_k[80] = {
|
||||
0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
|
||||
0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
|
||||
@ -57,26 +50,24 @@ __device__ __constant__ uint64_t round_k[80] = {
|
||||
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
|
||||
0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
|
||||
};
|
||||
__device__ __forceinline__ uint64_t load64(const uint8_t* x) {
|
||||
return ((uint64_t)x[0] << 56) | ((uint64_t)x[1] << 48) |
|
||||
((uint64_t)x[2] << 40) | ((uint64_t)x[3] << 32) |
|
||||
((uint64_t)x[4] << 24) | ((uint64_t)x[5] << 16) |
|
||||
((uint64_t)x[6] << 8) | ((uint64_t)x[7]);
|
||||
__device__ __forceinline__ uint64_t load64(const unsigned char* x) {
|
||||
return ((uint64_t)x[0] << 56) | ((uint64_t)x[1] << 48) | ((uint64_t)x[2] << 40) | ((uint64_t)x[3] << 32)
|
||||
| ((uint64_t)x[4] << 24) | ((uint64_t)x[5] << 16) | ((uint64_t)x[6] << 8) | ((uint64_t)x[7]);
|
||||
}
|
||||
__device__ __forceinline__ void store64(uint8_t* x, uint64_t v) {
|
||||
x[0] = (uint8_t)(v >> 56);
|
||||
x[1] = (uint8_t)(v >> 48);
|
||||
x[2] = (uint8_t)(v >> 40);
|
||||
x[3] = (uint8_t)(v >> 32);
|
||||
x[4] = (uint8_t)(v >> 24);
|
||||
x[5] = (uint8_t)(v >> 16);
|
||||
x[6] = (uint8_t)(v >> 8);
|
||||
x[7] = (uint8_t)(v);
|
||||
__device__ __forceinline__ void store64(unsigned char* x, uint64_t v) {
|
||||
x[0] = (unsigned char)(v >> 56);
|
||||
x[1] = (unsigned char)(v >> 48);
|
||||
x[2] = (unsigned char)(v >> 40);
|
||||
x[3] = (unsigned char)(v >> 32);
|
||||
x[4] = (unsigned char)(v >> 24);
|
||||
x[5] = (unsigned char)(v >> 16);
|
||||
x[6] = (unsigned char)(v >> 8);
|
||||
x[7] = (unsigned char)(v);
|
||||
}
|
||||
__device__ __forceinline__ uint64_t rot64(uint64_t x, int bits) {
|
||||
return (x >> bits) | (x << (64 - bits));
|
||||
}
|
||||
__device__ void sha512_block(sha512_state* s, const uint8_t* blk) {
|
||||
__device__ void sha512_block(sha512_state* s, const unsigned char* blk) {
|
||||
uint64_t w[16];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 16; i++) {
|
||||
@ -112,7 +103,6 @@ __device__ void sha512_block(sha512_state* s, const uint8_t* blk) {
|
||||
c = b;
|
||||
b = a;
|
||||
a = temp1 + temp2;
|
||||
|
||||
w[idx] += s0 + w[idx7] + s1;
|
||||
}
|
||||
s->h[0] += a;
|
||||
@ -124,31 +114,30 @@ __device__ void sha512_block(sha512_state* s, const uint8_t* blk) {
|
||||
s->h[6] += g;
|
||||
s->h[7] += h;
|
||||
}
|
||||
__device__ void sha512_final(sha512_state* s, const uint8_t* blk, size_t total_size) {
|
||||
uint8_t temp[SHA512_BLOCK_SIZE];
|
||||
__device__ void sha512_final(sha512_state* s, const unsigned char* blk, unsigned long total_size) {
|
||||
unsigned char temp[128];
|
||||
memset(temp, 0, sizeof(temp));
|
||||
|
||||
size_t last_size = total_size & (SHA512_BLOCK_SIZE - 1);
|
||||
unsigned long last_size = total_size & (128 - 1);
|
||||
if (last_size) {
|
||||
memcpy(temp, blk, last_size);
|
||||
}
|
||||
temp[last_size] = 0x80;
|
||||
if (last_size > (SHA512_BLOCK_SIZE - 9)) {
|
||||
if (last_size > (128 - 9)) {
|
||||
sha512_block(s, temp);
|
||||
memset(temp, 0, sizeof(temp));
|
||||
}
|
||||
store64(temp + SHA512_BLOCK_SIZE - 8, total_size << 3);
|
||||
store64(temp + 128 - 8, total_size << 3);
|
||||
sha512_block(s, temp);
|
||||
}
|
||||
__device__ void sha512_get(const sha512_state* s, uint8_t* hash, unsigned int offset, unsigned int len) {
|
||||
if (offset > SHA512_BLOCK_SIZE)
|
||||
__device__ void sha512_get(const sha512_state* s, unsigned char* hash, unsigned int offset, unsigned int len) {
|
||||
if (offset > 128)
|
||||
return;
|
||||
if (len > SHA512_BLOCK_SIZE - offset)
|
||||
len = SHA512_BLOCK_SIZE - offset;
|
||||
if (len > 128 - offset)
|
||||
len = 128 - offset;
|
||||
unsigned int i = offset >> 3;
|
||||
unsigned int off = offset & 7;
|
||||
if (off) {
|
||||
uint8_t tmp[8];
|
||||
unsigned char tmp[8];
|
||||
store64(tmp, s->h[i]);
|
||||
unsigned int c = 8 - off;
|
||||
if (c > len) c = len;
|
||||
@ -164,12 +153,11 @@ __device__ void sha512_get(const sha512_state* s, uint8_t* hash, unsigned int of
|
||||
i++;
|
||||
}
|
||||
if (len) {
|
||||
uint8_t tmp[8];
|
||||
unsigned char tmp[8];
|
||||
store64(tmp, s->h[i]);
|
||||
memcpy(hash, tmp, len);
|
||||
}
|
||||
}
|
||||
__device__ void sha512_init(struct sha512_state* s) {
|
||||
memcpy(s, &sha512_initial_state, sizeof(*s));
|
||||
}
|
||||
#endif
|
||||
}
|
@ -46,6 +46,7 @@ void displayConfig() {
|
||||
unsigned processor_count = std::thread::hardware_concurrency();
|
||||
if (conf.proc == 0 || conf.proc > static_cast<unsigned>(processor_count)) {
|
||||
conf.proc = static_cast<unsigned>(processor_count);
|
||||
|
||||
}
|
||||
printf("Threads: %u, high addresses (2%02x+)\n", conf.proc, conf.high.load());
|
||||
}
|
||||
|
@ -4,22 +4,19 @@
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime.h>
|
||||
#include <curand_kernel.h>
|
||||
#include <arpa/inet.h>
|
||||
#include "../libs/sha512.cuh"
|
||||
#include <arpa/inet.h>
|
||||
#include "../libs/ed25519.cuh"
|
||||
#include "../libs/edsign.cuh"
|
||||
#define MAX_RESULTS 1024
|
||||
__constant__ char hexDigitsConst[17] = "0123456789abcdef";
|
||||
__device__ __constant__ char hexDigitsConst[17] = "0123456789abcdef";
|
||||
using Address = unsigned char[16];
|
||||
using Key = unsigned char[32];
|
||||
|
||||
struct KeysBox {
|
||||
Key PublicKey;
|
||||
Key PrivateKey;
|
||||
};
|
||||
struct option {
|
||||
unsigned proc = 0;
|
||||
__device__ __managed__ unsigned high = 0x10;
|
||||
unsigned high = 0x10;
|
||||
};
|
||||
__device__ static option conf;
|
||||
struct ds64 {
|
||||
@ -28,13 +25,9 @@ struct ds64 {
|
||||
struct ds46 {
|
||||
char data[46];
|
||||
};
|
||||
__host__ __device__ ds64 KeyToString(const unsigned char* key) noexcept {
|
||||
__device__ ds64 KeyToString(const unsigned char* key) noexcept {
|
||||
ds64 str;
|
||||
#ifdef __CUDA_ARCH__
|
||||
const char* hexDigits = hexDigitsConst;
|
||||
#else
|
||||
const char* hexDigits = "0123456789abcdef";
|
||||
#endif
|
||||
for (unsigned char i = 0; i < 32; i++) {
|
||||
str.data[2 * i] = hexDigits[key[i] >> 4];
|
||||
str.data[2 * i + 1] = hexDigits[key[i] & 0x0F];
|
||||
@ -69,10 +62,7 @@ __device__ void getRawAddress(int lErase, Key& InvertedPublicKey, Address& rawAd
|
||||
const int start = lErase / 8;
|
||||
if (bitsToShift != 0) {
|
||||
for (int i = start; i < start + 15; i++) {
|
||||
InvertedPublicKey[i] = static_cast<unsigned char>(
|
||||
(InvertedPublicKey[i] << bitsToShift) |
|
||||
(InvertedPublicKey[i + 1] >> (8 - bitsToShift))
|
||||
);
|
||||
InvertedPublicKey[i] = static_cast<unsigned char>((InvertedPublicKey[i] << bitsToShift) | (InvertedPublicKey[i + 1] >> (8 - bitsToShift)));
|
||||
}
|
||||
}
|
||||
rawAddr[0] = 0x02;
|
||||
@ -101,10 +91,7 @@ __device__ unsigned char zeroCounter(unsigned int x) {
|
||||
__device__ unsigned char getZeros(const unsigned char* v) {
|
||||
unsigned char leadZeros = 0;
|
||||
for (int i = 0; i < 32; i += 4) {
|
||||
unsigned int word = (static_cast<unsigned int>(v[i]) << 24) |
|
||||
(static_cast<unsigned int>(v[i + 1]) << 16) |
|
||||
(static_cast<unsigned int>(v[i + 2]) << 8) |
|
||||
(static_cast<unsigned int>(v[i + 3]));
|
||||
unsigned word = (static_cast<unsigned>(v[i]) << 24) | (static_cast<unsigned>(v[i + 1]) << 16) | (static_cast<unsigned>(v[i + 2]) << 8) | (static_cast<unsigned>(v[i + 3]));
|
||||
if (word == 0)
|
||||
leadZeros += 32;
|
||||
else {
|
||||
@ -118,8 +105,8 @@ __global__ void initRandStates(curandState* randStates) {
|
||||
int id = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
curand_init((unsigned long long)clock64() + id, id, 0, &randStates[id]);
|
||||
}
|
||||
__device__ void generateRandomBytes(uint8_t* buf, size_t size, curandState* state) {
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
__device__ void generateRandomBytes(unsigned char* buf, unsigned long size, curandState* state) {
|
||||
for (unsigned long i = 0; i < size; i++) {
|
||||
buf[i] = curand(state) & 0xFF;
|
||||
}
|
||||
}
|
||||
@ -127,65 +114,37 @@ __device__ void invertKey(const unsigned char* key, unsigned char* inverted) {
|
||||
for (int i = 0; i < 32; i++)
|
||||
inverted[i] = key[i] ^ 0xFF;
|
||||
}
|
||||
__device__ void compact_wipe(void* data, size_t length) {
|
||||
volatile uint8_t* p = (volatile uint8_t*)data;
|
||||
__device__ void compact_wipe(void* data, unsigned long length) {
|
||||
volatile unsigned char* p = (volatile unsigned char*)data;
|
||||
while (length--) {
|
||||
*p++ = 0;
|
||||
}
|
||||
}
|
||||
__device__ void ed25519_keygen(uint8_t private_key[64], uint8_t public_key[32], uint8_t random_seed[32]) {
|
||||
__device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]) {
|
||||
edsign_sec_to_pub(public_key, random_seed);
|
||||
memcpy(private_key, random_seed, 32);
|
||||
memcpy(private_key + 32, public_key, 32);
|
||||
compact_wipe(random_seed, 32);
|
||||
}
|
||||
struct Result {
|
||||
char ipv6[46];
|
||||
char pk[65];
|
||||
char sk[65];
|
||||
};
|
||||
__device__ __managed__ Result resultBuffer[MAX_RESULTS];
|
||||
__device__ __managed__ int resultCount = 0;
|
||||
__global__ __launch_bounds__(256) void minerKernel(curandState* randStates) {
|
||||
int tid_global = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
curandState localState = randStates[tid_global];
|
||||
uint8_t seed[32];
|
||||
int thid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
curandState localState = randStates[thid];
|
||||
Key seed;
|
||||
generateRandomBytes(seed, sizeof(seed), &localState);
|
||||
if (tid_global == 0) {
|
||||
printf("Seed: %s\n", KeyToString(seed).data);
|
||||
}
|
||||
while (true) {
|
||||
generateRandomBytes(seed, sizeof(seed), &localState);
|
||||
KeysBox keys;
|
||||
ed25519_keygen(keys.PrivateKey, keys.PublicKey, seed);
|
||||
int zeros = getZeros(keys.PublicKey);
|
||||
unsigned oldHigh = atomicMax(&conf.high, (unsigned)zeros);
|
||||
if (zeros > oldHigh) {
|
||||
if (unsigned zeros = getZeros(keys.PublicKey); zeros > atomicMax(&conf.high, (unsigned)zeros)) {
|
||||
Key inv;
|
||||
Address rawAddr_local;
|
||||
Address raw;
|
||||
invertKey(keys.PublicKey, inv);
|
||||
getRawAddress(zeros, inv, rawAddr_local);
|
||||
ds46 addrStr = getAddress(rawAddr_local);
|
||||
ds64 pkStr = KeyToString(keys.PublicKey);
|
||||
ds64 skStr = KeyToString(keys.PrivateKey);
|
||||
int idx = atomicAdd(&resultCount, 1);
|
||||
if (idx < MAX_RESULTS) {
|
||||
memcpy(resultBuffer[idx].ipv6, addrStr.data, sizeof(addrStr.data));
|
||||
memcpy(resultBuffer[idx].pk, pkStr.data, sizeof(pkStr.data));
|
||||
memcpy(resultBuffer[idx].sk, skStr.data, sizeof(skStr.data));
|
||||
}
|
||||
}
|
||||
if (tid_global == 0) {
|
||||
if (resultCount > 0) {
|
||||
for (int i = 0; i < resultCount; i++) {
|
||||
printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", resultBuffer[i].ipv6, resultBuffer[i].pk, resultBuffer[i].sk);
|
||||
}
|
||||
resultCount = 0;
|
||||
}
|
||||
getRawAddress(zeros, inv, raw);
|
||||
printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddress(raw).data, KeyToString(keys.PublicKey).data, KeyToString(keys.PrivateKey).data);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
randStates[tid_global] = localState;
|
||||
//randStates[thid] = localState;
|
||||
}
|
||||
int main() {
|
||||
curandState* d_randStates;
|
||||
|
Loading…
x
Reference in New Issue
Block a user