This commit is contained in:
rcxpony
2025-03-20 02:38:01 +05:00
parent 3cb19e527e
commit d01662bd63
6 changed files with 66 additions and 286 deletions

View File

@@ -1,5 +1,5 @@
NVCC := nvcc
NVCC_FLAGS := -rdc=true -O3 -Xptxas -O3 -Xcompiler -O3 \
NVCC_FLAGS := -rdc=true -O4 -Xptxas -O4 \
-use_fast_math -ftz=true -prec-div=false -prec-sqrt=false \
-gencode arch=compute_75,code=sm_75 \
--default-stream per-thread \

View File

@@ -1,7 +1,7 @@
#include <ed25519.cuh>
#include <precomp_data.h>
#include <f25519.cuh>
void __forceinline__ __host__ __device__ ge_madd(ge_p1p1* __restrict__ r, const ge_p3* __restrict__ p, const precomp_data* __restrict__ q) {
void __host__ __device__ ge_madd(ge_p1p1* __restrict__ r, const ge_p3* __restrict__ p, const precomp_data* __restrict__ q) {
fe t0;
fe_add(r->X, p->Y, p->X);
fe_sub(r->Y, p->Y, p->X);
@@ -52,7 +52,7 @@ void __host__ __device__ ge_p3_tobytes(unsigned char* __restrict__ s, const ge_p
fe_tobytes(s, y);
s[31] ^= fe_isnegative(x) << 7;
}
static unsigned char __forceinline__ __host__ __device__ equal(const signed char b, const signed char c) {
static unsigned char __host__ __device__ equal(const signed char b, const signed char c) {
unsigned long x = (b ^ c) - 1;
return static_cast<unsigned char>(x >>= 63);
}
@@ -61,7 +61,7 @@ static void __host__ __device__ cmov(precomp_data* __restrict__ t, const precomp
fe_cmov(t->yminusx, u->yminusx, b);
fe_cmov(t->xy2d, u->xy2d, b);
}
static void __forceinline__ __host__ __device__ select(precomp_data* __restrict__ t, int pos, signed char b) {
static void __host__ __device__ select(precomp_data* __restrict__ t, int pos, signed char b) {
precomp_data minust;
unsigned long x = b;
x >>= 63;
@@ -111,7 +111,7 @@ void __device__ __host__ ge_scalarmult_base(ge_p3* __restrict__ h, const unsigne
fe_1(h->Y);
fe_1(h->Z);
fe_0(h->T);
#pragma unroll
#pragma unroll 31
for (unsigned char i = 1; i < 64; i += 2) {
select(&t, i >> 1, e[i]);
ge_madd(&r, h, &t);

View File

@@ -2,7 +2,7 @@
#include <ed25519.cuh>
#include <sha512.cuh>
#include <ed25519.cuh>
__device__ __forceinline__ void expand_key(unsigned char* __restrict__ expanded, const unsigned char* __restrict__ secret) {
__device__ void expand_key(unsigned char* __restrict__ expanded, const unsigned char* __restrict__ secret) {
struct sha512_state s;
memcpy(&s, &sha512_initial_state, sizeof(s));
sha512_final(&s, secret);

View File

@@ -24,36 +24,9 @@ void __device__ __host__ fe_1(fe& __restrict__ h) {
h[9] = 0;
}
void __device__ __host__ fe_add(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) {
signed int f0 = f[0];
signed int f1 = f[1];
signed int f2 = f[2];
signed int f3 = f[3];
signed int f4 = f[4];
signed int f5 = f[5];
signed int f6 = f[6];
signed int f7 = f[7];
signed int f8 = f[8];
signed int f9 = f[9];
signed int g0 = g[0];
signed int g1 = g[1];
signed int g2 = g[2];
signed int g3 = g[3];
signed int g4 = g[4];
signed int g5 = g[5];
signed int g6 = g[6];
signed int g7 = g[7];
signed int g8 = g[8];
signed int g9 = g[9];
signed int h0 = f0 + g0;
signed int h1 = f1 + g1;
signed int h2 = f2 + g2;
signed int h3 = f3 + g3;
signed int h4 = f4 + g4;
signed int h5 = f5 + g5;
signed int h6 = f6 + g6;
signed int h7 = f7 + g7;
signed int h8 = f8 + g8;
signed int h9 = f9 + g9;
const signed int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
const signed int g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
const signed int h0 = f0 + g0, h1 = f1 + g1, h2 = f2 + g2, h3 = f3 + g3, h4 = f4 + g4, h5 = f5 + g5, h6 = f6 + g6, h7 = f7 + g7, h8 = f8 + g8, h9 = f9 + g9;
h[0] = h0;
h[1] = h1;
h[2] = h2;
@@ -66,47 +39,10 @@ void __device__ __host__ fe_add(fe& __restrict__ h, const fe& __restrict__ f, co
h[9] = h9;
}
void __device__ __host__ fe_cmov(fe& __restrict__ f, const fe& __restrict__ g, unsigned b) {
unsigned bb = ~b + 1;
signed int f0 = f[0];
signed int f1 = f[1];
signed int f2 = f[2];
signed int f3 = f[3];
signed int f4 = f[4];
signed int f5 = f[5];
signed int f6 = f[6];
signed int f7 = f[7];
signed int f8 = f[8];
signed int f9 = f[9];
signed int g0 = g[0];
signed int g1 = g[1];
signed int g2 = g[2];
signed int g3 = g[3];
signed int g4 = g[4];
signed int g5 = g[5];
signed int g6 = g[6];
signed int g7 = g[7];
signed int g8 = g[8];
signed int g9 = g[9];
signed int x0 = f0 ^ g0;
signed int x1 = f1 ^ g1;
signed int x2 = f2 ^ g2;
signed int x3 = f3 ^ g3;
signed int x4 = f4 ^ g4;
signed int x5 = f5 ^ g5;
signed int x6 = f6 ^ g6;
signed int x7 = f7 ^ g7;
signed int x8 = f8 ^ g8;
signed int x9 = f9 ^ g9;
x0 &= bb;
x1 &= bb;
x2 &= bb;
x3 &= bb;
x4 &= bb;
x5 &= bb;
x6 &= bb;
x7 &= bb;
x8 &= bb;
x9 &= bb;
const signed int x = -b;
const signed int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
const signed int g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
const signed int x0 = (f0 ^ g0) & x, x1 = (f1 ^ g1) & x, x2 = (f2 ^ g2) & x, x3 = (f3 ^ g3) & x, x4 = (f4 ^ g4) & x, x5 = (f5 ^ g5) & x, x6 = (f6 ^ g6) & x, x7 = (f7 ^ g7) & x, x8 = (f8 ^ g8) & x, x9 = (f9 ^ g9) & x;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
@@ -119,16 +55,7 @@ void __device__ __host__ fe_cmov(fe& __restrict__ f, const fe& __restrict__ g, u
f[9] = f9 ^ x9;
}
void __device__ __host__ fe_copy(fe& __restrict__ h, const fe& __restrict__ f) {
signed int f0 = f[0];
signed int f1 = f[1];
signed int f2 = f[2];
signed int f3 = f[3];
signed int f4 = f[4];
signed int f5 = f[5];
signed int f6 = f[6];
signed int f7 = f[7];
signed int f8 = f[8];
signed int f9 = f[9];
const signed int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
h[0] = f0;
h[1] = f1;
h[2] = f2;
@@ -153,10 +80,10 @@ void __device__ __host__ fe_invert(fe& __restrict__ out, const fe& __restrict__
fe_sq(t2, t0);
fe_mul(t1, t1, t2);
fe_sq(t2, t1);
#pragma unroll 4
for (unsigned char i = 1; i < 5; ++i) {
fe_sq(t2, t2);
}
fe_sq(t2, t2);
fe_sq(t2, t2);
fe_sq(t2, t2);
fe_sq(t2, t2);
fe_mul(t1, t2, t1);
fe_sq(t2, t1);
#pragma unroll 9
@@ -253,26 +180,8 @@ void __device__ __host__ fe_mul(fe& __restrict__ h, const fe& __restrict__ f, co
h[9] = static_cast<int>(h9);
}
void __host__ __device__ fe_neg(fe& __restrict__ h, const fe& __restrict__ f) {
signed int f0 = f[0];
signed int f1 = f[1];
signed int f2 = f[2];
signed int f3 = f[3];
signed int f4 = f[4];
signed int f5 = f[5];
signed int f6 = f[6];
signed int f7 = f[7];
signed int f8 = f[8];
signed int f9 = f[9];
signed int h0 = -f0;
signed int h1 = -f1;
signed int h2 = -f2;
signed int h3 = -f3;
signed int h4 = -f4;
signed int h5 = -f5;
signed int h6 = -f6;
signed int h7 = -f7;
signed int h8 = -f8;
signed int h9 = -f9;
const signed long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
const signed int h0 = -f0, h1 = -f1, h2 = -f2, h3 = -f3, h4 = -f4, h5 = -f5, h6 = -f6, h7 = -f7, h8 = -f8, h9 = -f9;
h[0] = h0;
h[1] = h1;
h[2] = h2;
@@ -285,84 +194,19 @@ void __host__ __device__ fe_neg(fe& __restrict__ h, const fe& __restrict__ f) {
h[9] = h9;
}
void __host__ __device__ fe_sq2(fe& __restrict__ h, const fe& __restrict__ f) {
int f0 = f[0];
int f1 = f[1];
int f2 = f[2];
int f3 = f[3];
int f4 = f[4];
int f5 = f[5];
int f6 = f[6];
int f7 = f[7];
int f8 = f[8];
int f9 = f[9];
int f0_2 = f0 << 1;
int f1_2 = f1 << 1;
int f2_2 = f2 << 1;
int f3_2 = f3 << 1;
int f4_2 = f4 << 1;
int f5_2 = f5 << 1;
int f6_2 = f6 << 1;
int f7_2 = f7 << 1;
int f5_38 = 38 * f5;
int f6_19 = 19 * f6;
int f7_38 = 38 * f7;
int f8_19 = 19 * f8;
int f9_38 = 38 * f9;
long f0f0 = f0 * static_cast<long>(f0);
long f0f1_2 = f0_2 * static_cast<long>(f1);
long f0f2_2 = f0_2 * static_cast<long>(f2);
long f0f3_2 = f0_2 * static_cast<long>(f3);
long f0f4_2 = f0_2 * static_cast<long>(f4);
long f0f5_2 = f0_2 * static_cast<long>(f5);
long f0f6_2 = f0_2 * static_cast<long>(f6);
long f0f7_2 = f0_2 * static_cast<long>(f7);
long f0f8_2 = f0_2 * static_cast<long>(f8);
long f0f9_2 = f0_2 * static_cast<long>(f9);
long f1f1_2 = f1_2 * static_cast<long>(f1);
long f1f2_2 = f1_2 * static_cast<long>(f2);
long f1f3_4 = f1_2 * static_cast<long>(f3_2);
long f1f4_2 = f1_2 * static_cast<long>(f4);
long f1f5_4 = f1_2 * static_cast<long>(f5_2);
long f1f6_2 = f1_2 * static_cast<long>(f6);
long f1f7_4 = f1_2 * static_cast<long>(f7_2);
long f1f8_2 = f1_2 * static_cast<long>(f8);
long f1f9_76 = f1_2 * static_cast<long>(f9_38);
long f2f2 = f2 * static_cast<long>(f2);
long f2f3_2 = f2_2 * static_cast<long>(f3);
long f2f4_2 = f2_2 * static_cast<long>(f4);
long f2f5_2 = f2_2 * static_cast<long>(f5);
long f2f6_2 = f2_2 * static_cast<long>(f6);
long f2f7_2 = f2_2 * static_cast<long>(f7);
long f2f8_38 = f2_2 * static_cast<long>(f8_19);
long f2f9_38 = f2 * static_cast<long>(f9_38);
long f3f3_2 = f3_2 * static_cast<long>(f3);
long f3f4_2 = f3_2 * static_cast<long>(f4);
long f3f5_4 = f3_2 * static_cast<long>(f5_2);
long f3f6_2 = f3_2 * static_cast<long>(f6);
long f3f7_76 = f3_2 * static_cast<long>(f7_38);
long f3f8_38 = f3_2 * static_cast<long>(f8_19);
long f3f9_76 = f3_2 * static_cast<long>(f9_38);
long f4f4 = f4 * static_cast<long>(f4);
long f4f5_2 = f4_2 * static_cast<long>(f5);
long f4f6_38 = f4_2 * static_cast<long>(f6_19);
long f4f7_38 = f4 * static_cast<long>(f7_38);
long f4f8_38 = f4_2 * static_cast<long>(f8_19);
long f4f9_38 = f4 * static_cast<long>(f9_38);
long f5f5_38 = f5 * static_cast<long>(f5_38);
long f5f6_38 = f5_2 * static_cast<long>(f6_19);
long f5f7_76 = f5_2 * static_cast<long>(f7_38);
long f5f8_38 = f5_2 * static_cast<long>(f8_19);
long f5f9_76 = f5_2 * static_cast<long>(f9_38);
long f6f6_19 = f6 * static_cast<long>(f6_19);
long f6f7_38 = f6 * static_cast<long>(f7_38);
long f6f8_38 = f6_2 * static_cast<long>(f8_19);
long f6f9_38 = f6 * static_cast<long>(f9_38);
long f7f7_38 = f7 * static_cast<long>(f7_38);
long f7f8_38 = f7_2 * static_cast<long>(f8_19);
long f7f9_76 = f7_2 * static_cast<long>(f9_38);
long f8f8_19 = f8 * static_cast<long>(f8_19);
long f8f9_38 = f8 * static_cast<long>(f9_38);
long f9f9_38 = f9 * static_cast<long>(f9_38);
const signed long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
const signed long f0_2 = f0 << 1, f1_2 = f1 << 1, f2_2 = f2 << 1, f3_2 = f3 << 1, f4_2 = f4 << 1, f5_2 = f5 << 1, f6_2 = f6 << 1, f7_2 = f7 << 1;
const signed int f5_38 = 38 * f5, f6_19 = 19 * f6, f7_38 = 38 * f7, f8_19 = 19 * f8, f9_38 = 38 * f9;
const long f0f0 = f0 * f0, f0f1_2 = f0_2 * f1, f0f2_2 = f0_2 * f2, f0f3_2 = f0_2 * f3, f0f4_2 = f0_2 * f4, f0f5_2 = f0_2 * f5, f0f6_2 = f0_2 * f6, f0f7_2 = f0_2 * f7, f0f8_2 = f0_2 * f8, f0f9_2 = f0_2 * f9;
const long f1f1_2 = f1_2 * f1, f1f2_2 = f1_2 * f2, f1f3_4 = f1_2 * f3_2, f1f4_2 = f1_2 * f4, f1f5_4 = f1_2 * f5_2, f1f6_2 = f1_2 * f6, f1f7_4 = f1_2 * f7_2, f1f8_2 = f1_2 * f8, f1f9_76 = f1_2 * f9_38;
const long f2f2 = f2 * f2, f2f3_2 = f2_2 * f3, f2f4_2 = f2_2 * f4, f2f5_2 = f2_2 * f5, f2f6_2 = f2_2 * f6, f2f7_2 = f2_2 * f7, f2f8_38 = f2_2 * f8_19, f2f9_38 = f2 * f9_38;
const long f3f3_2 = f3_2 * f3, f3f4_2 = f3_2 * f4, f3f5_4 = f3_2 * f5_2, f3f6_2 = f3_2 * f6, f3f7_76 = f3_2 * f7_38, f3f8_38 = f3_2 * f8_19, f3f9_76 = f3_2 * f9_38;
const long f4f4 = f4 * f4, f4f5_2 = f4_2 * f5, f4f6_38 = f4_2 * f6_19, f4f7_38 = f4 * f7_38, f4f8_38 = f4_2 * f8_19, f4f9_38 = f4 * f9_38;
const long f5f5_38 = f5 * f5_38, f5f6_38 = f5_2 * f6_19, f5f7_76 = f5_2 * f7_38, f5f8_38 = f5_2 * f8_19, f5f9_76 = f5_2 * f9_38;
const long f6f6_19 = f6 * f6_19, f6f7_38 = f6 * f7_38, f6f8_38 = f6_2 * f8_19, f6f9_38 = f6 * f9_38;
const long f7f7_38 = f7 * f7_38, f7f8_38 = f7_2 * f8_19, f7f9_76 = f7_2 * f9_38;
const long f8f8_19 = f8 * f8_19, f8f9_38 = f8 * f9_38;
const long f9f9_38 = f9 * f9_38;
long h0 = f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
long h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
long h2 = f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
@@ -373,7 +217,6 @@ void __host__ __device__ fe_sq2(fe& __restrict__ h, const fe& __restrict__ f) {
long h7 = f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38;
long h8 = f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38;
long h9 = f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2;
long carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9;
h0 <<= 1;
h1 <<= 1;
h2 <<= 1;
@@ -384,84 +227,33 @@ void __host__ __device__ fe_sq2(fe& __restrict__ h, const fe& __restrict__ f) {
h7 <<= 1;
h8 <<= 1;
h9 <<= 1;
carry0 = (h0 + static_cast<long>(1 << 25)) >> 26;
h1 += carry0;
h0 -= carry0 << 26;
carry4 = (h4 + static_cast<long>(1 << 25)) >> 26;
h5 += carry4;
h4 -= carry4 << 26;
carry1 = (h1 + static_cast<long>(1 << 24)) >> 25;
h2 += carry1;
h1 -= carry1 << 25;
carry5 = (h5 + static_cast<long>(1 << 24)) >> 25;
h6 += carry5;
h5 -= carry5 << 25;
carry2 = (h2 + static_cast<long>(1 << 25)) >> 26;
h3 += carry2;
h2 -= carry2 << 26;
carry6 = (h6 + static_cast<long>(1 << 25)) >> 26;
h7 += carry6;
h6 -= carry6 << 26;
carry3 = (h3 + static_cast<long>(1 << 24)) >> 25;
h4 += carry3;
h3 -= carry3 << 25;
carry7 = (h7 + static_cast<long>(1 << 24)) >> 25;
h8 += carry7;
h7 -= carry7 << 25;
carry4 = (h4 + static_cast<long>(1 << 25)) >> 26;
h5 += carry4;
h4 -= carry4 << 26;
carry8 = (h8 + static_cast<long>(1 << 25)) >> 26;
h9 += carry8;
h8 -= carry8 << 26;
carry9 = (h9 + static_cast<long>(1 << 24)) >> 25;
h0 += carry9 * 19;
h9 -= carry9 << 25;
carry0 = (h0 + static_cast<long>(1 << 25)) >> 26;
h1 += carry0;
h0 -= carry0 << 26;
h[0] = (int)h0;
h[1] = (int)h1;
h[2] = (int)h2;
h[3] = (int)h3;
h[4] = (int)h4;
h[5] = (int)h5;
h[6] = (int)h6;
h[7] = (int)h7;
h[8] = (int)h8;
h[9] = (int)h9;
signed long carry = (h0 + 33554432L) >> 26L; h1 += carry; h0 -= carry << 26L;
carry = (h4 + 33554432L) >> 26L; h5 += carry; h4 -= carry << 26L;
carry = (h1 + 16777216L) >> 25L; h2 += carry; h1 -= carry << 25L;
carry = (h5 + 16777216L) >> 25L; h6 += carry; h5 -= carry << 25L;
carry = (h2 + 33554432L) >> 26L; h3 += carry; h2 -= carry << 26L;
carry = (h6 + 33554432L) >> 26L; h7 += carry; h6 -= carry << 26L;
carry = (h3 + 16777216L) >> 25L; h4 += carry; h3 -= carry << 25L;
carry = (h7 + 16777216L) >> 25L; h8 += carry; h7 -= carry << 25L;
carry = (h4 + 33554432L) >> 26L; h5 += carry; h4 -= carry << 26L;
carry = (h8 + 33554432L) >> 26L; h9 += carry; h8 -= carry << 26L;
carry = (h9 + 16777216L) >> 25L; h0 += carry * 19L; h9 -= carry << 25L;
carry = (h0 + 33554432L) >> 26L; h1 += carry; h0 -= carry << 26L;
h[0] = static_cast<int>(h0);
h[1] = static_cast<int>(h1);
h[2] = static_cast<int>(h2);
h[3] = static_cast<int>(h3);
h[4] = static_cast<int>(h4);
h[5] = static_cast<int>(h5);
h[6] = static_cast<int>(h6);
h[7] = static_cast<int>(h7);
h[8] = static_cast<int>(h8);
h[9] = static_cast<int>(h9);
}
void __device__ __host__ fe_sub(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) {
signed int f0 = f[0];
signed int f1 = f[1];
signed int f2 = f[2];
signed int f3 = f[3];
signed int f4 = f[4];
signed int f5 = f[5];
signed int f6 = f[6];
signed int f7 = f[7];
signed int f8 = f[8];
signed int f9 = f[9];
signed int g0 = g[0];
signed int g1 = g[1];
signed int g2 = g[2];
signed int g3 = g[3];
signed int g4 = g[4];
signed int g5 = g[5];
signed int g6 = g[6];
signed int g7 = g[7];
signed int g8 = g[8];
signed int g9 = g[9];
signed int h0 = f0 - g0;
signed int h1 = f1 - g1;
signed int h2 = f2 - g2;
signed int h3 = f3 - g3;
signed int h4 = f4 - g4;
signed int h5 = f5 - g5;
signed int h6 = f6 - g6;
signed int h7 = f7 - g7;
signed int h8 = f8 - g8;
signed int h9 = f9 - g9;
const signed int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
const signed int g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
const signed int h0 = f0 - g0, h1 = f1 - g1, h2 = f2 - g2, h3 = f3 - g3, h4 = f4 - g4, h5 = f5 - g5, h6 = f6 - g6, h7 = f7 - g7, h8 = f8 - g8, h9 = f9 - g9;
h[0] = h0;
h[1] = h1;
h[2] = h2;
@@ -474,8 +266,7 @@ void __device__ __host__ fe_sub(fe& __restrict__ h, const fe& __restrict__ f, co
h[9] = h9;
}
void __device__ __host__ fe_tobytes(unsigned char* __restrict__ s, const fe& __restrict__ h) {
int h0 = h[0], h1 = h[1], h2 = h[2], h3 = h[3], h4 = h[4], h5 = h[5], h6 = h[6], h7 = h[7], h8 = h[8], h9 = h[9], q;
int carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9;
signed int h0 = h[0], h1 = h[1], h2 = h[2], h3 = h[3], h4 = h[4], h5 = h[5], h6 = h[6], h7 = h[7], h8 = h[8], h9 = h[9], carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9, q;
q = (19 * h9 + (1 << 24)) >> 25;
q = (h0 + q) >> 26;
q = (h1 + q) >> 25;
@@ -517,7 +308,7 @@ void __device__ __host__ fe_tobytes(unsigned char* __restrict__ s, const fe& __r
h8 -= carry8 << 26;
carry9 = h9 >> 25;
h9 -= carry9 << 25;
s[0] = static_cast<unsigned char>(h0 >> 0);
s[0] = static_cast<unsigned char>(h0);
s[1] = static_cast<unsigned char>(h0 >> 8);
s[2] = static_cast<unsigned char>(h0 >> 16);
s[3] = static_cast<unsigned char>((h0 >> 24) | (h1 << 2));
@@ -533,7 +324,7 @@ void __device__ __host__ fe_tobytes(unsigned char* __restrict__ s, const fe& __r
s[13] = static_cast<unsigned char>(h4 >> 2);
s[14] = static_cast<unsigned char>(h4 >> 10);
s[15] = static_cast<unsigned char>(h4 >> 18);
s[16] = static_cast<unsigned char>(h5 >> 0);
s[16] = static_cast<unsigned char>(h5);
s[17] = static_cast<unsigned char>(h5 >> 8);
s[18] = static_cast<unsigned char>(h5 >> 16);
s[19] = static_cast<unsigned char>((h5 >> 24) | (h6 << 1));

View File

@@ -28,24 +28,14 @@ __device__ __constant__ unsigned long round_k[80] = {
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};
__device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) {
unsigned char h[8];
h[0] = x[0];
h[1] = x[1];
h[2] = x[2];
h[3] = x[3];
h[4] = x[4];
h[5] = x[5];
h[6] = x[6];
h[7] = x[7];
return (static_cast<unsigned long>(h[0]) << 56) | (static_cast<unsigned long>(h[1]) << 48) | (static_cast<unsigned long>(h[2]) << 40) | (static_cast<unsigned long>(h[3]) << 32) |
(static_cast<unsigned long>(h[4]) << 24) | (static_cast<unsigned long>(h[5]) << 16) | (static_cast<unsigned long>(h[6]) << 8) | (static_cast<unsigned long>(h[7]));
return (static_cast<unsigned long>(x[0]) << 56) | (static_cast<unsigned long>(x[1]) << 48) | (static_cast<unsigned long>(x[2]) << 40) | (static_cast<unsigned long>(x[3]) << 32) | (static_cast<unsigned long>(x[4]) << 24) | (static_cast<unsigned long>(x[5]) << 16) | (static_cast<unsigned long>(x[6]) << 8) | (static_cast<unsigned long>(x[7]));
}
__device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v) {
__device__ inline void store64(unsigned char* __restrict__ x, unsigned long v) {
#pragma unroll 8
for (unsigned char i = 0; i < 8; i++) x[i] = static_cast<unsigned char>(v >> (56 - i * 8));
}
#define rot64(x, bits) ((x >> bits) | (x << (64 - bits)))
__device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk) {
__device__ void sha512_block(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk) {
unsigned long w[16];
#pragma unroll 16
for (unsigned char i = 0; i < 16; i++) {

View File

@@ -106,7 +106,7 @@ __global__ void KeyGenKernel(curandState* __restrict__ randStates) {
KeysBox32 keys;
rmbytes(seed, &localState);
ed25519_create_keypair(keys.PrivateKey, keys.PublicKey, seed);
if (unsigned zeros = getZeros(keys.PublicKey); zeros > atomicMax(&d_high, zeros)) {
if (unsigned char zeros = getZeros(keys.PublicKey); zeros > atomicMax(&d_high, zeros)) {
Addr16 raw;
Key32 inv;
invertKey(keys.PublicKey, inv);
@@ -165,8 +165,7 @@ int main(int argc, char* argv[]) {
free(h_seeds);
cudaFree(d_seeds);
#endif
KeyGenKernel<<<THDIVTHPB, THREADSPB>>>(rst);
cudaDeviceSynchronize();
KeyGenKernel << <THDIVTHPB, THREADSPB >> > (rst);
cudaFree(rst);
return 0;
}