This commit is contained in:
rcxpony
2025-08-17 15:39:13 +05:00
parent 279f949dff
commit 8d83b3198d
7 changed files with 94 additions and 85 deletions

Binary file not shown.

Binary file not shown.

View File

@@ -1,11 +1,17 @@
cmake_minimum_required(VERSION 3.31)
project(yggm)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
file(GLOB ${PROJECT_NAME}SOURCES *.cpp)
file(GLOB ${PROJECT_NAME}HEADERS *.h)
add_executable(${PROJECT_NAME} sources/main.cpp)
target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/libs)
if(CMAKE_BUILD_TYPE)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
add_definitions(-DDEBUG)
@@ -28,21 +34,23 @@ else()
endif()
set(CXX_ADDITIONAL_FLAGS
"-fomit-frame-pointer \
-funroll-loops \
-ftree-vectorize \
-ftree-slp-vectorize \
-fdelete-null-pointer-checks \
-fno-exceptions \
-fno-rtti \
-funsafe-math-optimizations \
-fstrict-aliasing \
-fstrict-overflow \
-fno-stack-protector \
-fno-math-errno")
"-fomit-frame-pointer \
-funroll-loops \
-ftree-vectorize \
-ftree-slp-vectorize \
-fdelete-null-pointer-checks \
-fno-exceptions \
-fno-rtti \
-funsafe-math-optimizations \
-fstrict-aliasing \
-fstrict-overflow \
-fno-stack-protector \
-fno-math-errno")
# Release flag set: native-arch tuning, -O3, fast-math and LTO, with the
# -Wall/-Wextra/-Wpedantic/-Wconversion/-Wuninitialized/-Wsign-conversion warnings enabled.
set(CXXFLAGSR "-march=native -O3 -ffast-math -pipe -Wall -Wextra -Wpedantic -Wconversion -Wuninitialized -Wsign-conversion -flto")
# Debug flag set: no optimisation (-O0) and debug info (-g), same warning list.
set(CXXFLAGSD "-O0 -pipe -Wall -Wextra -Wpedantic -Wconversion -Wuninitialized -Wsign-conversion -g")
# Release builds additionally append CXX_ADDITIONAL_FLAGS; Debug builds use CXXFLAGSD alone.
set(CMAKE_CXX_FLAGS_RELEASE "${CXXFLAGSR} ${CXX_ADDITIONAL_FLAGS}")
set(CMAKE_CXX_FLAGS_DEBUG "${CXXFLAGSD}")
target_link_libraries(${PROJECT_NAME} pthread sodium)
target_link_libraries(${PROJECT_NAME} sodium)

View File

@@ -1,5 +1,5 @@
NVCC := nvcc
NVCC_FLAGS := -rdc=true -O3 -Xptxas -O3 \
NVCC_FLAGS := -rdc=true -O3 -Xptxas -O3 \
-use_fast_math -ftz=true -prec-div=false -prec-sqrt=false \
-gencode arch=compute_75,code=sm_75 \
--default-stream per-thread \

View File

@@ -27,9 +27,9 @@ void __device__ __host__ fe_1(fe& __restrict__ h) {
}
// fe_add: h = f + g, limb by limb.
// Copies indices 0..9 of f and g into locals, adds matching limbs with plain
// int additions and stores the sums through h. No carry propagation or
// reduction appears in the visible lines.
// NOTE(review): `fe` is declared elsewhere; presumably a 10-element array of
// 32-bit signed limbs -- confirm against its typedef.
// NOTE(review): this listing is a rendered diff hunk. The three `signed int`
// declarations and the three `int` declarations below are the before/after
// text of the same lines, and the `@@` line marks elided stores.
void __device__ __host__ fe_add(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) {
signed int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
signed int g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
signed int h0 = f0 + g0, h1 = f1 + g1, h2 = f2 + g2, h3 = f3 + g3, h4 = f4 + g4, h5 = f5 + g5, h6 = f6 + g6, h7 = f7 + g7, h8 = f8 + g8, h9 = f9 + g9;
int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
int g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
int h0 = f0 + g0, h1 = f1 + g1, h2 = f2 + g2, h3 = f3 + g3, h4 = f4 + g4, h5 = f5 + g5, h6 = f6 + g6, h7 = f7 + g7, h8 = f8 + g8, h9 = f9 + g9;
// Write the sums back only after every input limb has been read.
h[0] = h0;
h[1] = h1;
h[2] = h2;
@@ -42,10 +42,10 @@ void __device__ __host__ fe_add(fe& __restrict__ h, const fe& __restrict__ f, co
h[9] = h9;
}
// fe_cmov: branch-free conditional move of g into f, selected by b.
// x = -b is used as a bit mask: b == 1 gives all ones, b == 0 gives zero.
// Each limb is then updated as f ^ ((f ^ g) & x), which yields g when the mask
// is all ones and leaves f unchanged when it is zero.
// NOTE(review): b is expected to be 0 or 1; any other value produces a
// bitwise mix of f and g. Presumably written this way to avoid
// data-dependent branches -- confirm intent.
// NOTE(review): this listing is a rendered diff hunk. The `signed int` lines
// and the `int` lines below are the before/after text of the same four
// declarations, and the `@@` line marks elided stores.
void __device__ __host__ fe_cmov(fe& __restrict__ f, const fe& __restrict__ g, const unsigned b) {
signed int x = -b;
signed int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
signed int g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
signed int x0 = (f0 ^ g0) & x, x1 = (f1 ^ g1) & x, x2 = (f2 ^ g2) & x, x3 = (f3 ^ g3) & x, x4 = (f4 ^ g4) & x, x5 = (f5 ^ g5) & x, x6 = (f6 ^ g6) & x, x7 = (f7 ^ g7) & x, x8 = (f8 ^ g8) & x, x9 = (f9 ^ g9) & x;
int x = -b;
int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
int g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
int x0 = (f0 ^ g0) & x, x1 = (f1 ^ g1) & x, x2 = (f2 ^ g2) & x, x3 = (f3 ^ g3) & x, x4 = (f4 ^ g4) & x, x5 = (f5 ^ g5) & x, x6 = (f6 ^ g6) & x, x7 = (f7 ^ g7) & x, x8 = (f8 ^ g8) & x, x9 = (f9 ^ g9) & x;
// XOR-ing the masked difference back in selects g (mask set) or keeps f (mask clear).
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
@@ -59,7 +59,7 @@ void __device__ __host__ fe_cmov(fe& __restrict__ f, const fe& __restrict__ g, c
}
//h[0..9] = f[0..9]
void __device__ __host__ fe_copy(fe& __restrict__ h, const fe& __restrict__ f) {
signed int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
h[0] = f0;
h[1] = f1;
h[2] = f2;
@@ -131,19 +131,19 @@ void __device__ __host__ fe_invert(fe& __restrict__ out, const fe& __restrict__
fe_mul(t1, t2, t1);
fe_mul(t2, t1, t1);
#pragma unroll 49
for (signed int i = 0; i < 49; ++i) {
for (int i = 0; i < 49; ++i) {
fe_mul(t2, t2, t2);
}
fe_mul(t2, t2, t1);
fe_mul(t3, t2, t2);
#pragma unroll 99
for (signed int i = 0; i < 99; ++i) {
for (int i = 0; i < 99; ++i) {
fe_mul(t3, t3, t3);
}
fe_mul(t2, t3, t2);
fe_mul(t2, t2, t2);
#pragma unroll 49
for (signed int i = 0; i < 49; ++i) {
for (int i = 0; i < 49; ++i) {
fe_mul(t2, t2, t2);
}
fe_mul(t1, t2, t1);
@@ -161,29 +161,29 @@ int __device__ __host__ fe_isnegative(const fe& __restrict__ f) {
}
// fe_mul: h = f * g.
// Widens the ten limbs of f and g to `long`, forms all 100 limb-by-limb
// partial products, sums them into ten accumulators h0..h9, then runs a carry
// chain before storing the result (the tail of the chain and the stores are
// elided by the `@@` line below).
//
// Partial-product naming: fXgY_2 is doubled (`<< 1L`), fXgY_19 is multiplied by
// 19 and fXgY_38 by 38. The `* 38L >> 1L` form evaluates to a multiply by 19,
// since the product with 38 is always even.
// NOTE(review): the 19/38 factors presumably fold high-order terms back for a
// field modulo 2^255 - 19 with alternating 26/25-bit limbs -- confirm against
// the `fe` definition, which is not visible here.
// NOTE(review): the products need 64-bit arithmetic; `long` is only 32 bits on
// LLP64 targets (e.g. Windows). A fixed-width 64-bit type would make this
// independent of the data model.
// NOTE(review): this listing is a rendered diff hunk. The `signed long` lines
// and the `long` lines are the before/after text of the same declarations.
void __device__ __host__ fe_mul(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) {
signed long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
signed long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
signed long f0g0 = f0 * g0, f0g1 = f0 * g1, f0g2 = f0 * g2, f0g3 = f0 * g3, f0g4 = f0 * g4, f0g5 = f0 * g5, f0g6 = f0 * g6, f0g7 = f0 * g7, f0g8 = f0 * g8, f0g9 = f0 * g9;
signed long f1g0 = f1 * g0, f1g1_2 = f1 * g1 << 1L, f1g2 = f1 * g2, f1g3_2 = f1 * g3 << 1L, f1g4 = f1 * g4, f1g5_2 = f1 * g5 << 1L, f1g6 = f1 * g6, f1g7_2 = f1 * g7 << 1L, f1g8 = f1 * g8, f1g9_38 = f1 * g9 * 38L;
signed long f2g0 = f2 * g0, f2g1 = f2 * g1, f2g2 = f2 * g2, f2g3 = f2 * g3, f2g4 = f2 * g4, f2g5 = f2 * g5, f2g6 = f2 * g6, f2g7 = f2 * g7, f2g8_19 = f2 * g8 * 19L, f2g9_19 = f2 * g9 * 38L >> 1L;
signed long f3g0 = f3 * g0, f3g1_2 = f3 * g1 << 1L, f3g2 = f3 * g2, f3g3_2 = f3 * g3 << 1L, f3g4 = f3 * g4, f3g5_2 = f3 * g5 << 1L, f3g6 = f3 * g6, f3g7_38 = f3 * g7 * 38L, f3g8_19 = f3 * g8 * 19L, f3g9_38 = f3 * g9 * 38L;
signed long f4g0 = f4 * g0, f4g1 = f4 * g1, f4g2 = f4 * g2, f4g3 = f4 * g3, f4g4 = f4 * g4, f4g5 = f4 * g5, f4g6_19 = f4 * g6 * 19L, f4g7_19 = f4 * g7 * 38L >> 1L, f4g8_19 = f4 * g8 * 19L, f4g9_19 = f4 * g9 * 38L >> 1L;
signed long f5g0 = f5 * g0, f5g1_2 = f5 * g1 << 1L, f5g2 = f5 * g2, f5g3_2 = f5 * g3 << 1L, f5g4 = f5 * g4, f5g5_38 = f5 * g5 * 38L, f5g6_19 = f5 * g6 * 19L, f5g7_38 = f5 * g7 * 38L, f5g8_19 = f5 * g8 * 19L, f5g9_38 = f5 * g9 * 38L;
signed long f6g0 = f6 * g0, f6g1 = f6 * g1, f6g2 = f6 * g2, f6g3 = f6 * g3, f6g4_19 = f6 * g4 * 19L, f6g5_19 = f6 * g5 * 38L >> 1L, f6g6_19 = f6 * g6 * 19L, f6g7_19 = f6 * g7 * 38L >> 1L, f6g8_19 = f6 * g8 * 19L, f6g9_19 = f6 * g9 * 38L >> 1L;
signed long f7g0 = f7 * g0, f7g1_2 = f7 * g1 << 1L, f7g2 = f7 * g2, f7g3_38 = f7 * g3 * 38L, f7g4_19 = f7 * g4 * 19L, f7g5_38 = f7 * g5 * 38L, f7g6_19 = f7 * g6 * 19L, f7g7_38 = f7 * g7 * 38L, f7g8_19 = f7 * g8 * 19L, f7g9_38 = f7 * g9 * 38L;
signed long f8g0 = f8 * g0, f8g1 = f8 * g1, f8g2_19 = f8 * g2 * 19L, f8g3_19 = f8 * g3 * 38L >> 1L, f8g4_19 = f8 * g4 * 19L, f8g5_19 = f8 * g5 * 38L >> 1L, f8g6_19 = f8 * g6 * 19L, f8g7_19 = f8 * g7 * 38L >> 1L, f8g8_19 = f8 * g8 * 19L, f8g9_19 = f8 * g9 * 38L >> 1L;
signed long f9g0 = f9 * g0, f9g1_38 = f9 * g1 * 38L, f9g2_19 = f9 * g2 * 19L, f9g3_38 = f9 * g3 * 38L, f9g4_19 = f9 * g4 * 19L, f9g5_38 = f9 * g5 * 38L, f9g6_19 = f9 * g6 * 19L, f9g7_38 = f9 * g7 * 38L, f9g8_19 = f9 * g8 * 19L, f9g9_38 = f9 * g9 * 38L;
signed long h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
signed long h1 = f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
signed long h2 = f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
signed long h3 = f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
signed long h4 = f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
signed long h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
signed long h6 = f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38;
signed long h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19;
signed long h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38;
signed long h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0;
signed long carry = (h0 + 33554432L) >> 26L; h1 += carry; h0 -= carry << 26L;
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
long f0g0 = f0 * g0, f0g1 = f0 * g1, f0g2 = f0 * g2, f0g3 = f0 * g3, f0g4 = f0 * g4, f0g5 = f0 * g5, f0g6 = f0 * g6, f0g7 = f0 * g7, f0g8 = f0 * g8, f0g9 = f0 * g9;
long f1g0 = f1 * g0, f1g1_2 = f1 * g1 << 1L, f1g2 = f1 * g2, f1g3_2 = f1 * g3 << 1L, f1g4 = f1 * g4, f1g5_2 = f1 * g5 << 1L, f1g6 = f1 * g6, f1g7_2 = f1 * g7 << 1L, f1g8 = f1 * g8, f1g9_38 = f1 * g9 * 38L;
long f2g0 = f2 * g0, f2g1 = f2 * g1, f2g2 = f2 * g2, f2g3 = f2 * g3, f2g4 = f2 * g4, f2g5 = f2 * g5, f2g6 = f2 * g6, f2g7 = f2 * g7, f2g8_19 = f2 * g8 * 19L, f2g9_19 = f2 * g9 * 38L >> 1L;
long f3g0 = f3 * g0, f3g1_2 = f3 * g1 << 1L, f3g2 = f3 * g2, f3g3_2 = f3 * g3 << 1L, f3g4 = f3 * g4, f3g5_2 = f3 * g5 << 1L, f3g6 = f3 * g6, f3g7_38 = f3 * g7 * 38L, f3g8_19 = f3 * g8 * 19L, f3g9_38 = f3 * g9 * 38L;
long f4g0 = f4 * g0, f4g1 = f4 * g1, f4g2 = f4 * g2, f4g3 = f4 * g3, f4g4 = f4 * g4, f4g5 = f4 * g5, f4g6_19 = f4 * g6 * 19L, f4g7_19 = f4 * g7 * 38L >> 1L, f4g8_19 = f4 * g8 * 19L, f4g9_19 = f4 * g9 * 38L >> 1L;
long f5g0 = f5 * g0, f5g1_2 = f5 * g1 << 1L, f5g2 = f5 * g2, f5g3_2 = f5 * g3 << 1L, f5g4 = f5 * g4, f5g5_38 = f5 * g5 * 38L, f5g6_19 = f5 * g6 * 19L, f5g7_38 = f5 * g7 * 38L, f5g8_19 = f5 * g8 * 19L, f5g9_38 = f5 * g9 * 38L;
long f6g0 = f6 * g0, f6g1 = f6 * g1, f6g2 = f6 * g2, f6g3 = f6 * g3, f6g4_19 = f6 * g4 * 19L, f6g5_19 = f6 * g5 * 38L >> 1L, f6g6_19 = f6 * g6 * 19L, f6g7_19 = f6 * g7 * 38L >> 1L, f6g8_19 = f6 * g8 * 19L, f6g9_19 = f6 * g9 * 38L >> 1L;
long f7g0 = f7 * g0, f7g1_2 = f7 * g1 << 1L, f7g2 = f7 * g2, f7g3_38 = f7 * g3 * 38L, f7g4_19 = f7 * g4 * 19L, f7g5_38 = f7 * g5 * 38L, f7g6_19 = f7 * g6 * 19L, f7g7_38 = f7 * g7 * 38L, f7g8_19 = f7 * g8 * 19L, f7g9_38 = f7 * g9 * 38L;
long f8g0 = f8 * g0, f8g1 = f8 * g1, f8g2_19 = f8 * g2 * 19L, f8g3_19 = f8 * g3 * 38L >> 1L, f8g4_19 = f8 * g4 * 19L, f8g5_19 = f8 * g5 * 38L >> 1L, f8g6_19 = f8 * g6 * 19L, f8g7_19 = f8 * g7 * 38L >> 1L, f8g8_19 = f8 * g8 * 19L, f8g9_19 = f8 * g9 * 38L >> 1L;
long f9g0 = f9 * g0, f9g1_38 = f9 * g1 * 38L, f9g2_19 = f9 * g2 * 19L, f9g3_38 = f9 * g3 * 38L, f9g4_19 = f9 * g4 * 19L, f9g5_38 = f9 * g5 * 38L, f9g6_19 = f9 * g6 * 19L, f9g7_38 = f9 * g7 * 38L, f9g8_19 = f9 * g8 * 19L, f9g9_38 = f9 * g9 * 38L;
long h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
long h1 = f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
long h2 = f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
long h3 = f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
long h4 = f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
long h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
long h6 = f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38;
long h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19;
long h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38;
long h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0;
// Rounded carries: adding 2^25 (33554432) before `>> 26`, or 2^24 (16777216)
// before `>> 25`, rounds to nearest, so the remainder left in each limb is
// signed and centred on zero.
long carry = (h0 + 33554432L) >> 26L; h1 += carry; h0 -= carry << 26L;
carry = (h4 + 33554432L) >> 26L; h5 += carry; h4 -= carry << 26L;
carry = (h1 + 16777216L) >> 25L; h2 += carry; h1 -= carry << 25L;
carry = (h5 + 16777216L) >> 25L; h6 += carry; h5 -= carry << 25L;
@@ -208,8 +208,8 @@ void __device__ __host__ fe_mul(fe& __restrict__ h, const fe& __restrict__ f, co
}
// fe_neg: h = -f, limb by limb.
// Reads indices 0..9 of f into locals, negates each one and stores the results
// through h. No carry or reduction appears in the visible lines.
// NOTE(review): this listing is a rendered diff hunk. The `signed int` lines
// and the `int` lines below are the before/after text of the same two
// declarations, and the `@@` line marks elided stores.
void __host__ __device__ fe_neg(fe& __restrict__ h, const fe& __restrict__ f) {
signed int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
signed int h0 = -f0, h1 = -f1, h2 = -f2, h3 = -f3, h4 = -f4, h5 = -f5, h6 = -f6, h7 = -f7, h8 = -f8, h9 = -f9;
int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
int h0 = -f0, h1 = -f1, h2 = -f2, h3 = -f3, h4 = -f4, h5 = -f5, h6 = -f6, h7 = -f7, h8 = -f8, h9 = -f9;
// Stores happen only after every input limb has been read.
h[0] = h0;
h[1] = h1;
h[2] = h2;
@@ -222,19 +222,19 @@ void __host__ __device__ fe_neg(fe& __restrict__ h, const fe& __restrict__ f) {
h[9] = h9;
}
// fe_mul2: squares f and doubles the result.
// Every partial product below pairs a limb of f with another limb of f, and
// each visible accumulator sum is shifted left by one before the carry chain,
// so the visible arithmetic computes 2 * f * f rather than 2 * f.
// NOTE(review): the name suggests "multiply by two"; confirm callers expect a
// doubled square. h3..h6, most of the carry chain and most stores are elided
// by the `@@` lines below.
//
// Naming: fX_2 is 2*fX, fX_19 / fX_38 are 19*fX / 38*fX. The suffix on a
// product (e.g. f1f9_76) is the total constant factor folded into it.
// NOTE(review): f5_38 .. f9_38 are declared `int` but initialised from `long`
// expressions, which narrows implicitly. This relies on 38 * limb fitting in
// 32 bits -- confirm the limb bounds.
// NOTE(review): this listing is a rendered diff hunk. The `signed long` /
// `signed int` lines and the `long` / `int` lines are the before/after text of
// the same declarations.
void __host__ __device__ fe_mul2(fe& __restrict__ h, const fe& __restrict__ f) {
const signed long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
signed long f0_2 = f0 << 1, f1_2 = f1 << 1, f2_2 = f2 << 1, f3_2 = f3 << 1, f4_2 = f4 << 1, f5_2 = f5 << 1, f6_2 = f6 << 1, f7_2 = f7 << 1;
signed int f5_38 = 38 * f5, f6_19 = 19 * f6, f7_38 = 38 * f7, f8_19 = 19 * f8, f9_38 = 38 * f9;
signed long f0f0 = f0 * f0, f0f1_2 = f0_2 * f1, f0f2_2 = f0_2 * f2, f0f3_2 = f0_2 * f3, f0f4_2 = f0_2 * f4, f0f5_2 = f0_2 * f5, f0f6_2 = f0_2 * f6, f0f7_2 = f0_2 * f7, f0f8_2 = f0_2 * f8, f0f9_2 = f0_2 * f9;
signed long f1f1_2 = f1_2 * f1, f1f2_2 = f1_2 * f2, f1f3_4 = f1_2 * f3_2, f1f4_2 = f1_2 * f4, f1f5_4 = f1_2 * f5_2, f1f6_2 = f1_2 * f6, f1f7_4 = f1_2 * f7_2, f1f8_2 = f1_2 * f8, f1f9_76 = f1_2 * f9_38;
signed long f2f2 = f2 * f2, f2f3_2 = f2_2 * f3, f2f4_2 = f2_2 * f4, f2f5_2 = f2_2 * f5, f2f6_2 = f2_2 * f6, f2f7_2 = f2_2 * f7, f2f8_38 = f2_2 * f8_19, f2f9_38 = f2 * f9_38;
signed long f3f3_2 = f3_2 * f3, f3f4_2 = f3_2 * f4, f3f5_4 = f3_2 * f5_2, f3f6_2 = f3_2 * f6, f3f7_76 = f3_2 * f7_38, f3f8_38 = f3_2 * f8_19, f3f9_76 = f3_2 * f9_38;
signed long f4f4 = f4 * f4, f4f5_2 = f4_2 * f5, f4f6_38 = f4_2 * f6_19, f4f7_38 = f4 * f7_38, f4f8_38 = f4_2 * f8_19, f4f9_38 = f4 * f9_38;
signed long f5f5_38 = f5 * f5_38, f5f6_38 = f5_2 * f6_19, f5f7_76 = f5_2 * f7_38, f5f8_38 = f5_2 * f8_19, f5f9_76 = f5_2 * f9_38;
signed long f6f6_19 = f6 * f6_19, f6f7_38 = f6 * f7_38, f6f8_38 = f6_2 * f8_19, f6f9_38 = f6 * f9_38;
signed long f7f7_38 = f7 * f7_38, f7f8_38 = f7_2 * f8_19, f7f9_76 = f7_2 * f9_38;
signed long f8f8_19 = f8 * f8_19, f8f9_38 = f8 * f9_38;
signed long f9f9_38 = f9 * f9_38;
const long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
long f0_2 = f0 << 1, f1_2 = f1 << 1, f2_2 = f2 << 1, f3_2 = f3 << 1, f4_2 = f4 << 1, f5_2 = f5 << 1, f6_2 = f6 << 1, f7_2 = f7 << 1;
int f5_38 = 38 * f5, f6_19 = 19 * f6, f7_38 = 38 * f7, f8_19 = 19 * f8, f9_38 = 38 * f9;
long f0f0 = f0 * f0, f0f1_2 = f0_2 * f1, f0f2_2 = f0_2 * f2, f0f3_2 = f0_2 * f3, f0f4_2 = f0_2 * f4, f0f5_2 = f0_2 * f5, f0f6_2 = f0_2 * f6, f0f7_2 = f0_2 * f7, f0f8_2 = f0_2 * f8, f0f9_2 = f0_2 * f9;
long f1f1_2 = f1_2 * f1, f1f2_2 = f1_2 * f2, f1f3_4 = f1_2 * f3_2, f1f4_2 = f1_2 * f4, f1f5_4 = f1_2 * f5_2, f1f6_2 = f1_2 * f6, f1f7_4 = f1_2 * f7_2, f1f8_2 = f1_2 * f8, f1f9_76 = f1_2 * f9_38;
long f2f2 = f2 * f2, f2f3_2 = f2_2 * f3, f2f4_2 = f2_2 * f4, f2f5_2 = f2_2 * f5, f2f6_2 = f2_2 * f6, f2f7_2 = f2_2 * f7, f2f8_38 = f2_2 * f8_19, f2f9_38 = f2 * f9_38;
long f3f3_2 = f3_2 * f3, f3f4_2 = f3_2 * f4, f3f5_4 = f3_2 * f5_2, f3f6_2 = f3_2 * f6, f3f7_76 = f3_2 * f7_38, f3f8_38 = f3_2 * f8_19, f3f9_76 = f3_2 * f9_38;
long f4f4 = f4 * f4, f4f5_2 = f4_2 * f5, f4f6_38 = f4_2 * f6_19, f4f7_38 = f4 * f7_38, f4f8_38 = f4_2 * f8_19, f4f9_38 = f4 * f9_38;
long f5f5_38 = f5 * f5_38, f5f6_38 = f5_2 * f6_19, f5f7_76 = f5_2 * f7_38, f5f8_38 = f5_2 * f8_19, f5f9_76 = f5_2 * f9_38;
long f6f6_19 = f6 * f6_19, f6f7_38 = f6 * f7_38, f6f8_38 = f6_2 * f8_19, f6f9_38 = f6 * f9_38;
long f7f7_38 = f7 * f7_38, f7f8_38 = f7_2 * f8_19, f7f9_76 = f7_2 * f9_38;
long f8f8_19 = f8 * f8_19, f8f9_38 = f8 * f9_38;
long f9f9_38 = f9 * f9_38;
// The trailing `<< 1` on each sum is the extra doubling applied to the square.
long h0 = (f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38) << 1;
long h1 = (f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38) << 1;
long h2 = (f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19) << 1;
@@ -245,7 +245,7 @@ void __host__ __device__ fe_mul2(fe& __restrict__ h, const fe& __restrict__ f) {
long h7 = (f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38) << 1;
long h8 = (f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38) << 1;
long h9 = (f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2) << 1;
// Rounded carries: add 2^25 before `>> 26` (or 2^24 before `>> 25`) so the
// remainder left in each limb is signed and centred on zero.
signed long carry = (h0 + 33554432L) >> 26L; h1 += carry; h0 -= carry << 26L;
long carry = (h0 + 33554432L) >> 26L; h1 += carry; h0 -= carry << 26L;
carry = (h4 + 33554432L) >> 26L; h5 += carry; h4 -= carry << 26L;
carry = (h1 + 16777216L) >> 25L; h2 += carry; h1 -= carry << 25L;
carry = (h5 + 16777216L) >> 25L; h6 += carry; h5 -= carry << 25L;
@@ -269,9 +269,9 @@ void __host__ __device__ fe_mul2(fe& __restrict__ h, const fe& __restrict__ f) {
// Accumulators are narrowed back to int when stored.
h[9] = static_cast<int>(h9);
}
// fe_sub: h = f - g, limb by limb.
// Copies indices 0..9 of f and g into locals, subtracts matching limbs with
// plain int subtractions and stores the differences through h. No borrow
// propagation or reduction appears in the visible lines.
// NOTE(review): this listing is a rendered diff hunk. The `signed int` lines
// and the `int` lines below are the before/after text of the same three
// declarations, and the `@@` line marks elided stores.
void __device__ __host__ fe_sub(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) {
signed int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
signed int g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
signed int h0 = f0 - g0, h1 = f1 - g1, h2 = f2 - g2, h3 = f3 - g3, h4 = f4 - g4, h5 = f5 - g5, h6 = f6 - g6, h7 = f7 - g7, h8 = f8 - g8, h9 = f9 - g9;
int f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
int g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
int h0 = f0 - g0, h1 = f1 - g1, h2 = f2 - g2, h3 = f3 - g3, h4 = f4 - g4, h5 = f5 - g5, h6 = f6 - g6, h7 = f7 - g7, h8 = f8 - g8, h9 = f9 - g9;
// Write the differences back only after every input limb has been read.
h[0] = h0;
h[1] = h1;
h[2] = h2;
@@ -284,7 +284,7 @@ void __device__ __host__ fe_sub(fe& __restrict__ h, const fe& __restrict__ f, co
h[9] = h9;
}
void __device__ __host__ fe_tobytes(unsigned char* __restrict__ s, const fe& __restrict__ h) {
signed int h0 = h[0], h1 = h[1], h2 = h[2], h3 = h[3], h4 = h[4], h5 = h[5], h6 = h[6], h7 = h[7], h8 = h[8], h9 = h[9], carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9, q;
int h0 = h[0], h1 = h[1], h2 = h[2], h3 = h[3], h4 = h[4], h5 = h[5], h6 = h[6], h7 = h[7], h8 = h[8], h9 = h[9], carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9, q;
q = (19 * h9 + (1 << 24)) >> 25;
q = (h0 + q) >> 26;
q = (h1 + q) >> 25;

View File

@@ -1,4 +1,4 @@
#include "../libs/defines.h"
#include <defines.h>
#include <arpa/inet.h>
#include <atomic>
#include <immintrin.h>

View File

@@ -1,26 +1,27 @@
#include <stdio.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <sha512.cuh>
#include <defines.h>
#include <ed25519.cuh>
#include <edsign.cuh>
#include <string.cuh>
#include <keymanip.cuh>
#include <defines.h>
#include <sha512.cuh>
#include <stdio.h>
#include <string.cuh>
__device__ unsigned d_high = 0x14;
__device__ int parameters(const char* arg) noexcept {
if ((cstring_find(arg, "--altitude") == 0 && cstring_length(arg) == 10) ||
(cstring_find(arg, "-a") == 0 && cstring_length(arg) == 2)) {
if ((cstring_find(arg, "--altitude") == 0 && cstring_length(arg) == 10) || (cstring_find(arg, "-a") == 0 && cstring_length(arg) == 2)) {
return 777;
}
int space_index = cstring_find(arg, " ");
if (space_index == -1) return 0;
if (space_index == -1)
return 0;
const int substr_start = space_index + 1;
char sub_arg[256];
extract_substring(arg, substr_start, sub_arg, 256);
if (cstring_find(arg, "--altitude") != -1 || cstring_find(arg, "-a") != -1) {
unsigned tmp_high;
if (cstring_to_ull(sub_arg, &tmp_high) != 0) return 1;
if (cstring_to_ull(sub_arg, &tmp_high) != 0)
return 1;
d_high = tmp_high;
}
return 0;
@@ -98,7 +99,7 @@ __device__ __forceinline__ void rmbytes(unsigned char* __restrict__ buf, curandS
// KeyGenKernel: per-thread key search loop.
// Each thread copies its own curandState out of randStates, then loops while
// WHCOND holds. In the visible part of the body it inverts the public key,
// derives a raw address from it, prints the address and keys, and records
// `zeros` in the device global d_high.
// NOTE(review): the start of the loop body, including whatever condition
// guards the print and the d_high update, is elided by the `@@` line below.
// NOTE(review): this listing is a rendered diff hunk. The doubled #ifdef /
// #else / #endif lines are the before/after text of the same directives,
// not nested conditionals.
__global__ void KeyGenKernel(curandState* __restrict__ randStates) {
// Flat 1D global thread index.
// NOTE(review): randStates[idx] is read with no bounds check, so the launch
// must not create more threads than there are states -- confirm at the call site.
int idx = blockIdx.x * blockDim.x + threadIdx.x;
curandState localState = randStates[idx];
#ifdef DEBUG
#ifdef DEBUG
// Per-thread iteration counter, used only for the progress print below.
unsigned x = 0;
#endif
while (WHCOND) {
@@ -111,23 +112,23 @@ __global__ void KeyGenKernel(curandState* __restrict__ randStates) {
Key32 inv;
invertKey(keys.PublicKey, inv);
getRawAddress(zeros, inv, raw);
// RELEASE prints the public and secret key on separate lines; other builds
// print the private key immediately followed by the public key on one line.
#ifdef RELEASE
#ifdef RELEASE
printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddr(raw).data, ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data);
#else
#else
printf("\nIPv6:\t%s\nFK:\t%s%s\n", getAddr(raw).data, ktos(keys.PrivateKey).data, ktos(keys.PublicKey).data);
#endif
#endif
// NOTE(review): plain, non-atomic store to a __device__ global that every
// thread can write. If this is meant to track a running maximum across
// threads, concurrent writers can overwrite each other -- confirm whether
// atomicMax is needed.
d_high = zeros;
}
#ifdef DEBUG
#ifdef DEBUG
// Print progress once every 256 iterations of this thread.
// NOTE(review): x is unsigned but is formatted with %d; %u would match the type.
if ((++x & 0xFF) == 0) {
printf("\rIters: %d", x);
}
#endif
#endif
}
}
// main: host entry point. Prints the build type, performs setup that is elided
// by the `@@` line below, launches KeyGenKernel and frees the RNG state buffer.
// NOTE(review): this listing is a rendered diff hunk. The two `d_result`
// declarations and the two kernel-launch lines are the before/after text of
// the same statements.
int main(int argc, char* argv[]) {
printf("BuildType: %s\n", __BUILDTYPE__);
// NOTE(review): only d_result is a pointer here; mBpSM and h_high are plain ints.
int* d_result, mBpSM, h_high;
int *d_result, mBpSM, h_high;
char** d_argv;
cudaDeviceProp prop;
curandState* rst;
@@ -165,7 +166,7 @@ int main(int argc, char* argv[]) {
free(h_seeds);
cudaFree(d_seeds);
#endif
// NOTE(review): no cudaGetLastError() or explicit synchronisation follows the
// launch in the visible lines, and the cudaFree return values are ignored, so
// a failed launch or in-kernel fault would go unreported.
KeyGenKernel << <THDIVTHPB, THREADSPB >> > (rst);
KeyGenKernel<<<THDIVTHPB, THREADSPB>>>(rst);
cudaFree(rst);
return 0;
}