Compare commits

44 Commits
v0.1.0 ... main

Author SHA1 Message Date
6fa8acd2e3 .ll 2025-09-18 18:24:47 +05:00
rcxpony
5bf02d95ab meson update 2025-09-03 22:00:46 +05:00
rcxpony
361964edf7 owo 2025-08-21 13:06:32 +05:00
rcxpony
ab6061f5bf owo 2025-08-21 13:05:05 +05:00
rcxpony
4b5a9967ed owo 2025-08-21 04:52:43 +05:00
rcxpony
558b6b3a2c meson build instructions 2025-08-21 03:50:51 +05:00
rcxpony
f0befd4925 meson instead cmake/makefile 2025-08-21 03:44:39 +05:00
rcxpony
2944a25784 meson instead cmake/makefile 2025-08-21 03:44:21 +05:00
rcxpony
f2b96920fb type conv 2025-08-20 14:13:10 +05:00
rcxpony
75c78b5a35 sosiski 2025-08-17 15:40:13 +05:00
rcxpony
8d83b3198d sosiski 2025-08-17 15:39:13 +05:00
rcxpony
279f949dff cmake fixes, atomic changes,.clang-format 2025-08-07 06:17:29 +05:00
rcxpony
fa31fc69b0 non-avx2 support and build fix 2025-07-15 22:28:58 +05:00
rcxpony
e591951b69 non-avx2 support and build fix 2025-07-15 22:27:45 +05:00
rcxpony
8b33e5d73d non-avx2 support and build fix 2025-07-15 22:20:45 +05:00
rcxpony
6dd774c206 non-avx2 support and build fix 2025-07-15 22:19:24 +05:00
rcxpony
d3dd7b7e4d fixes 2025-03-27 05:48:03 +05:00
rcxpony
137db6f6b2 fixes 2025-03-27 05:45:15 +05:00
rcxpony
c7797dd663 fixes 2025-03-22 02:00:36 +05:00
rcxpony
37a2f9cf52 fixes 2025-03-20 13:19:18 +05:00
rcxpony
14fd274f36 ptr to ref 2025-03-20 13:00:13 +05:00
rcxpony
d01662bd63 fixes 2025-03-20 02:38:01 +05:00
rcxpony
3cb19e527e fixes 2025-03-19 22:49:28 +05:00
rcxpony
98ddd7387a fixes 2025-03-19 20:53:54 +05:00
rcxpony
c37db7679b fixes 2025-03-19 20:51:33 +05:00
rcxpony
988534bc45 fixes 2025-03-18 20:39:28 +05:00
rcxpony
a9a693dc07 fixes 2025-03-18 03:46:01 +05:00
rcxpony
1499fc5d28 fixes 2025-03-18 03:33:23 +05:00
rcxpony
6d92f9ee43 fixes 2025-03-18 03:17:20 +05:00
rcxpony
b17df465d5 fixes 2025-03-17 22:14:53 +05:00
b0c15fad82 fixes 2025-03-17 19:20:29 +05:00
5b543aa710 fixes 2025-03-17 15:33:07 +05:00
7ea0e62697 optimized 2025-03-17 15:04:47 +05:00
bc814d988c optimized 2025-03-17 05:32:00 +05:00
b7240b3d8d optimized 2025-03-16 22:03:05 +05:00
9e6cff28fd test 2025-03-16 18:27:56 +05:00
735cc0c467 fixes 2025-03-15 21:19:30 +05:00
4f5a8129ae fixes 2025-03-15 19:54:15 +05:00
e6c4f9ceb5 fixes 2025-03-15 19:36:02 +05:00
17f6d7be58 fixes 2025-03-15 16:11:56 +05:00
cfecea84ec fixes 2025-03-15 15:54:52 +05:00
f333047c1f code cleanup 2025-03-15 15:35:48 +05:00
6198670dc4 fixes 2025-03-15 14:40:30 +05:00
da1d9b12ca readme 2025-03-15 13:35:35 +05:00
23 changed files with 2613 additions and 521 deletions

271
.clang-format Normal file
View File

@@ -0,0 +1,271 @@
BasedOnStyle: WebKit
AccessModifierOffset: -4
AlignAfterOpenBracket: DontAlign
AlignArrayOfStructures: None
AlignConsecutiveAssignments:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCompound: false
AlignFunctionPointers: false
PadOperators: true
AlignConsecutiveBitFields:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCompound: false
AlignFunctionPointers: false
PadOperators: false
AlignConsecutiveDeclarations:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCompound: false
AlignFunctionPointers: false
PadOperators: false
AlignConsecutiveMacros:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCompound: false
AlignFunctionPointers: false
PadOperators: false
AlignConsecutiveShortCaseStatements:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCaseArrows: false
AlignCaseColons: false
AlignConsecutiveTableGenBreakingDAGArgColons:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCompound: false
AlignFunctionPointers: false
PadOperators: false
AlignConsecutiveTableGenCondOperatorColons:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCompound: false
AlignFunctionPointers: false
PadOperators: false
AlignConsecutiveTableGenDefinitionColons:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCompound: false
AlignFunctionPointers: false
PadOperators: false
AlignEscapedNewlines: Right
AlignOperands: DontAlign
AlignTrailingComments:
Kind: Never
OverEmptyLines: 0
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowBreakBeforeNoexceptSpecifier: Never
AllowShortBlocksOnASingleLine: Empty
AllowShortCaseExpressionOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: false
AllowShortCompoundRequirementOnASingleLine: true
AllowShortEnumsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: All
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AttributeMacros:
- __capability
BinPackArguments: true
BinPackParameters: true
BitFieldColonSpacing: Both
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: Never
AfterEnum: false
AfterFunction: true
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakAdjacentStringLiterals: true
BreakAfterAttributes: Leave
BreakAfterJavaFieldAnnotations: false
BreakAfterReturnType: None
BreakArrays: true
BreakBeforeBinaryOperators: All
BreakBeforeBraces: Attach
BreakBeforeConceptDeclarations: Always
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakFunctionDefinitionParameters: false
BreakInheritanceList: BeforeColon
BreakStringLiterals: true
BreakTemplateDeclarations: MultiLine
ColumnLimit: 0
CommentPragmas: "^ IWYU pragma:"
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: LogicalBlock
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: false
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IfMacros:
- KJ_IF_MAYBE
IncludeBlocks: Preserve
IncludeCategories:
- Regex: ^"(llvm|llvm-c|clang|clang-c)/
Priority: 2
SortPriority: 0
CaseSensitive: false
- Regex: ^(<|"(gtest|gmock|isl|json)/)
Priority: 3
SortPriority: 0
CaseSensitive: false
- Regex: .*
Priority: 1
SortPriority: 0
CaseSensitive: false
IncludeIsMainRegex: (Test)?$
IncludeIsMainSourceRegex: ""
IndentAccessModifiers: false
IndentCaseBlocks: false
IndentCaseLabels: false
IndentExternBlock: AfterExternBlock
IndentGotoLabels: true
IndentPPDirectives: None
IndentRequiresClause: true
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertBraces: false
InsertNewlineAtEOF: false
InsertTrailingCommas: None
IntegerLiteralSeparator:
Binary: 0
BinaryMinDigits: 0
Decimal: 0
DecimalMinDigits: 0
Hex: 0
HexMinDigits: 0
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLines:
AtEndOfFile: false
AtStartOfBlock: true
AtStartOfFile: true
LambdaBodyIndentation: Signature
LineEnding: DeriveLF
MacroBlockBegin: ""
MacroBlockEnd: ""
MainIncludeChar: Quote
MaxEmptyLinesToKeep: 1
NamespaceIndentation: Inner
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCBreakBeforeNestedBlockParam: true
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PPIndentWidth: -1
PackConstructorInitializers: BinPack
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakOpenParenthesis: 0
PenaltyBreakScopeResolution: 500
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyIndentedWhitespace: 0
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Left
QualifierAlignment: Leave
ReferenceAlignment: Pointer
ReflowComments: true
RemoveBracesLLVM: false
RemoveParentheses: Leave
RemoveSemicolon: false
RequiresClausePosition: OwnLine
RequiresExpressionIndentation: OuterScope
SeparateDefinitionBlocks: Leave
ShortNamespaceLines: 1
SkipMacroDefinitionBody: false
SortIncludes: CaseSensitive
SortJavaStaticImport: Before
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceAroundPointerQualifiers: Default
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: true
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeJsonColon: false
SpaceBeforeParens: ControlStatements
SpaceBeforeParensOptions:
AfterControlStatements: true
AfterForeachMacros: true
AfterFunctionDeclarationName: false
AfterFunctionDefinitionName: false
AfterIfMacros: true
AfterOverloadedOperator: false
AfterPlacementOperator: true
AfterRequiresInClause: false
AfterRequiresInExpression: false
BeforeNonEmptyParentheses: false
SpaceBeforeRangeBasedForLoopColon: true
SpaceBeforeSquareBrackets: false
SpaceInEmptyBlock: true
SpacesBeforeTrailingComments: 1
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
Minimum: 1
Maximum: -1
SpacesInParens: Never
SpacesInParensOptions:
ExceptDoubleParentheses: false
InConditionalStatements: false
InCStyleCasts: false
InEmptyParentheses: false
Other: false
SpacesInSquareBrackets: false
Standard: Latest
StatementAttributeLikeMacros:
- Q_EMIT
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
TabWidth: 8
TableGenBreakInsideDAGArg: DontBreak
UseTab: Never
VerilogBreakBetweenInstancePorts: true
WhitespaceSensitiveMacros:
- BOOST_PP_STRINGIZE
- CF_SWIFT_NAME
- NS_SWIFT_NAME
- PP_STRINGIZE
- STRINGIZE

2
.clangd Normal file
View File

@@ -0,0 +1,2 @@
CompileFlags:
Remove: ['-Xcompiler=-Wall,-Winvalid-pch','-rdc=true','-prec-div=false','-Xptxas','-ftz=true','-prec-sqrt=false','-gencode','--default-stream','--expt-relaxed-constexpr','-G']

4
.gitignore vendored
View File

@@ -1,3 +1,3 @@
.vscode/
build/*
-Makefile
.cache/
build/

View File

@@ -1,11 +0,0 @@
cmake_minimum_required(VERSION 3.31)
project(yggm)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
file(GLOB ${PROJECT_NAME}SOURCES *.cpp)
file(GLOB ${PROJECT_NAME}HEADERS *.h)
add_executable(${PROJECT_NAME} sources/main.cpp)
set(CXX_ADDITIONAL_FLAGS "-mavx2 -fomit-frame-pointer -ftree-vectorize -ftree-slp-vectorize -fdelete-null-pointer-checks -fno-exceptions -fno-rtti")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -ffast-math -pipe -Wall -Wextra -Wpedantic -Wconversion -Wuninitialized -Wsign-conversion -flto=full")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${CXX_ADDITIONAL_FLAGS}")
target_link_libraries(${PROJECT_NAME} pthread sodium)

View File

@@ -1,19 +1,18 @@
# yggm
### Yggdrasil address miner
# How to build for CPU
# How to build for CPU/GPU
### First, install gcc14 and clang. Then:
```sh
git clone https://rcxpony.name/rcxpony/yggm.git && cd yggm
cmake -B build && cmake --build build -j$(nproc)
build/yggm -t 10
```
# How to build for GPU
```sh
git clone https://rcxpony.name/rcxpony/yggm.git && cd yggm/build
make -j$(nproc)
./yggmcu -t 10
meson setup build -Dbuildtype=release --native-file native.ini && cd build
meson compile
./yggmc -t 10 // for CPU
./yggmcu -t 10 // for GPU
```
# ToDo
- [x] Cuda support (not optimized)
- [x] Support for avx2
- [ ] Support for sse4
- [x] CUDA support
- [x] AVX2 optimizations
- [ ] (CUDA) Optimize internal algorithms
- [ ] Windows support
- [ ] Tests
#

View File

@@ -1,31 +0,0 @@
NVCC := nvcc
NVCC_FLAGS := -rdc=true -O3 -Xptxas -O3 \
-use_fast_math -ftz=true -prec-div=false -prec-sqrt=false \
-gencode arch=compute_75,code=sm_75 \
--default-stream per-thread \
-Wno-deprecated-gpu-targets \
--expt-relaxed-constexpr \
-I../libs/ \
-std=c++17
BUILD ?= RELEASE
ifeq ($(BUILD),DEBUG)
BUILD_DEFINES := -DDEBUG
else
BUILD_DEFINES := -DRELEASE
endif
MAIN_SOURCE := ../sources/main.cu
LIBS_DIR := ../libs/
BUILD_DIR := ../build
LIBS_SOURCES := $(wildcard $(LIBS_DIR)*.cu)
LIBS_OBJECTS := $(patsubst $(LIBS_DIR)%.cu,$(BUILD_DIR)/%.o,$(LIBS_SOURCES))
TARGET := yggmcu
.PHONY: all clean
all: $(TARGET)
$(BUILD_DIR):
@mkdir -p $(BUILD_DIR)
$(TARGET): $(MAIN_SOURCE) $(LIBS_OBJECTS)
$(NVCC) $(NVCC_FLAGS) -o $@ $^
$(BUILD_DIR)/%.o: $(LIBS_DIR)%.cu | $(BUILD_DIR)
$(NVCC) $(NVCC_FLAGS) -c $< -o $@
clean:
@rm -f $(BUILD_DIR)/*.o $(TARGET)

14
libs/defines.h Normal file
View File

@@ -0,0 +1,14 @@
#ifndef __DEFINES_H
#define __DEFINES_H
#ifdef DEBUG
#define __BUILDTYPE__ "Debug"
#define THREADSPB 1
#define THDIVTHPB 1
#define WHCOND x < 0xFFFFFFFF
#else
#define __BUILDTYPE__ "Release"
#define THREADSPB 256
#define THDIVTHPB (tTh / THREADSPB)
#define WHCOND true
#endif
#endif

View File

@@ -1,106 +1,156 @@
#include <ed25519.cuh>
#include <f25519.cuh>
__device__ __constant__ struct ed25519_pt ed25519_base = {
{0x1a,0xd5,0x25,0x8f,0x60,0x2d,0x56,0xc9,0xb2,0xa7,0x25,0x95,0x60,0xc7,0x2c,0x69,
0x5c,0xdc,0xd6,0xfd,0x31,0xe2,0xa4,0xc0,0xfe,0x53,0x6e,0xcd,0xd3,0x36,0x69,0x21},
{0x58,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,
0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66},
{0xa3,0xdd,0xb7,0xa5,0xb3,0x8a,0xde,0x6d,0xf5,0x52,0x51,0x77,0x80,0x9f,0xf0,0x20,
0x7d,0xe3,0xab,0x64,0x8e,0x4e,0xea,0x66,0x65,0x76,0x8b,0xd7,0x0f,0x5f,0x87,0x67},
{1,0}
};
__device__ __constant__ struct ed25519_pt ed25519_neutral = {
{0}, {1,0}, {0}, {1,0}
};
__device__ __constant__ unsigned char ed25519_d[32] = {
0xa3,0x78,0x59,0x13,0xca,0x4d,0xeb,0x75,0xab,0xd8,0x41,0x41,0x4d,0x0a,0x70,0x00,
0x98,0xe8,0x79,0x77,0x79,0x40,0xc7,0x8c,0x73,0xfe,0x6f,0x2b,0xee,0x6c,0x03,0x52
};
__device__ __constant__ unsigned char ed25519_k[32] = {
0x59,0xf1,0xb2,0x26,0x94,0x9b,0xd6,0xeb,0x56,0xb1,0x83,0x82,0x9a,0x14,0xe0,0x00,
0x30,0xd1,0xf3,0xee,0xf2,0x80,0x8e,0x19,0xe7,0xfc,0xdf,0x56,0xdc,0xd9,0x06,0x24
};
__device__ void ed25519_unproject(unsigned char* __restrict__ x, unsigned char* __restrict__ y, const struct ed25519_pt* __restrict__ p) {
unsigned char z1[32];
f25519_inv__distinct(z1, p->z);
f25519_mul__distinct(x, p->x, z1);
f25519_mul__distinct(y, p->y, z1);
f25519_normalize(x);
f25519_normalize(y);
#include <precomp_data.h>
void __host__ __device__ ge_madd(ge_p1p1& __restrict__ r, const ge_p3& __restrict__ p, const precomp_data& __restrict__ q) {
fe t0;
fe_add(r.X, p.Y, p.X);
fe_sub(r.Y, p.Y, p.X);
fe_mul(r.Z, r.X, q.yplusx);
fe_mul(r.Y, r.Y, q.yminusx);
fe_mul(r.T, q.xy2d, p.T);
fe_add(t0, p.Z, p.Z);
fe_sub(r.X, r.Z, r.Y);
fe_add(r.Y, r.Z, r.Y);
fe_add(r.Z, t0, r.T);
fe_sub(r.T, t0, r.T);
}
__device__ void ed25519_pack(unsigned char* __restrict__ c, const unsigned char* __restrict__ x, const unsigned char* __restrict__ y) {
unsigned char tmp[32];
unsigned char parity;
f25519_copy(tmp, x);
f25519_normalize(tmp);
parity = (tmp[0] & 1) << 7;
f25519_copy(c, y);
f25519_normalize(c);
c[31] |= parity;
// r.XYZ = p.XYZ * p.TZT
void __host__ __device__ ge_p1p1_to_p2(ge_p2& __restrict__ r, const ge_p1p1& __restrict__ p) {
fe_mul(r.X, p.X, p.T);
fe_mul(r.Y, p.Y, p.Z);
fe_mul(r.Z, p.Z, p.T);
}
__device__ __forceinline__ void ed25519_add(struct ed25519_pt* __restrict__ r, const struct ed25519_pt* __restrict__ p1, const struct ed25519_pt* __restrict__ p2) {
unsigned char a[32], b[32], c[32], d[32], e[32], f[32], g[32], h[32];
f25519_sub(c, p1->y, p1->x);
f25519_sub(d, p2->y, p2->x);
f25519_mul__distinct(a, c, d);
f25519_add(c, p1->y, p1->x);
f25519_add(d, p2->y, p2->x);
f25519_mul__distinct(b, c, d);
f25519_mul__distinct(d, p1->t, p2->t);
f25519_mul__distinct(c, d, ed25519_k);
f25519_mul__distinct(d, p1->z, p2->z);
f25519_add(d, d, d);
f25519_sub(e, b, a);
f25519_sub(f, d, c);
f25519_add(g, d, c);
f25519_add(h, b, a);
f25519_mul__distinct(r->x, e, f);
f25519_mul__distinct(r->y, g, h);
f25519_mul__distinct(r->t, e, h);
f25519_mul__distinct(r->z, f, g);
void inline __host__ __device__ ge_p1p1_to_p3(ge_p3& __restrict__ r, const ge_p1p1& __restrict__ p) {
fe_mul(r.X, p.X, p.T);
fe_mul(r.Y, p.Y, p.Z);
fe_mul(r.Z, p.Z, p.T);
fe_mul(r.T, p.X, p.Y);
}
__device__ __forceinline__ void ed25519_double(struct ed25519_pt* __restrict__ r, const struct ed25519_pt* __restrict__ p) {
unsigned char a[32], b[32], c[32], e[32], f[32], g[32], h[32];
f25519_mul__distinct(a, p->x, p->x);
f25519_mul__distinct(b, p->y, p->y);
f25519_mul__distinct(c, p->z, p->z);
f25519_add(c, c, c);
f25519_add(f, p->x, p->y);
f25519_mul__distinct(e, f, f);
f25519_sub(e, e, a);
f25519_sub(e, e, b);
f25519_sub(g, b, a);
f25519_sub(f, g, c);
f25519_neg(h, b);
f25519_sub(h, h, a);
f25519_mul__distinct(r->x, e, f);
f25519_mul__distinct(r->y, g, h);
f25519_mul__distinct(r->t, e, h);
f25519_mul__distinct(r->z, f, g);
void __host__ __device__ ge_p2_dbl(ge_p1p1& __restrict__ r, const ge_p2& __restrict__ p) {
fe t0;
fe_mul(r.X, p.X, p.X);
fe_mul(r.Z, p.Y, p.Y);
fe_mul2(r.T, p.Z);
fe_add(r.Y, p.X, p.Y);
fe_mul(t0, r.Y, r.Y);
fe_add(r.Y, r.Z, r.X);
fe_sub(r.Z, r.Z, r.X);
fe_sub(r.X, t0, r.Y);
fe_sub(r.T, r.T, r.Z);
}
__device__ __forceinline__ void ed25519_copy(struct ed25519_pt* __restrict__ dst, const struct ed25519_pt* __restrict__ src) {
f25519_copy(dst->x, src->x);
f25519_copy(dst->y, src->y);
f25519_copy(dst->t, src->t);
f25519_copy(dst->z, src->z);
void __host__ __device__ ge_p3_dbl(ge_p1p1& __restrict__ r, const ge_p3& __restrict__ p) {
ge_p2 q;
fe_copy(q.X, p.X);
fe_copy(q.Y, p.Y);
fe_copy(q.Z, p.Z);
ge_p2_dbl(r, q);
}
__device__ void ed25519_smult(ed25519_pt* r_out, const struct ed25519_pt* __restrict__ p, const unsigned char* __restrict__ e) {
ed25519_pt r = ed25519_neutral;
#pragma unroll 256
for (int i = 255; i >= 0; i--) {
struct ed25519_pt s;
ed25519_double(&r, &r);
ed25519_add(&s, &r, p);
unsigned char bit = (e[i >> 3] >> (i & 7)) & 1;
f25519_select(r.x, r.x, s.x, bit);
f25519_select(r.y, r.y, s.y, bit);
f25519_select(r.z, r.z, s.z, bit);
f25519_select(r.t, r.t, s.t, bit);
void __host__ __device__ ge_p3_tobytes(unsigned char* __restrict__ s, const ge_p3& __restrict__ h) {
fe recip, x, y;
fe_invert(recip, h.Z);
fe_mul(x, h.X, recip);
fe_mul(y, h.Y, recip);
fe_tobytes(s, y);
s[31] ^= fe_isnegative(x) << 7;
}
static constexpr unsigned char __host__ __device__ equal(const signed char b, const signed char c) {
unsigned long x = (b ^ c) - 1;
return static_cast<unsigned char>(x >>= 63);
}
static void __host__ __device__ cmov(precomp_data& __restrict__ t, const precomp_data& __restrict__ u, unsigned char b) {
fe_cmov(t.yplusx, u.yplusx, b);
fe_cmov(t.yminusx, u.yminusx, b);
fe_cmov(t.xy2d, u.xy2d, b);
}
static void __host__ __device__ select(precomp_data& __restrict__ t, signed int pos, signed char b) {
precomp_data minust;
unsigned long x = b;
x >>= 63;
unsigned char bnegative = static_cast<unsigned char>(x);
unsigned char babs = b - (((-bnegative) & b) << 1);
t.yplusx[0] = 1;
t.yminusx[0] = 1;
t.yplusx[1] = 0;
t.yminusx[1] = 0;
t.yplusx[2] = 0;
t.yminusx[2] = 0;
t.yplusx[3] = 0;
t.yminusx[3] = 0;
t.yplusx[4] = 0;
t.yminusx[4] = 0;
t.yplusx[5] = 0;
t.yminusx[5] = 0;
t.yplusx[6] = 0;
t.yminusx[6] = 0;
t.yplusx[7] = 0;
t.yminusx[7] = 0;
t.yplusx[8] = 0;
t.yminusx[8] = 0;
t.yplusx[9] = 0;
t.yminusx[9] = 0;
t.xy2d[0] = 0;
t.xy2d[1] = 0;
t.xy2d[2] = 0;
t.xy2d[3] = 0;
t.xy2d[4] = 0;
t.xy2d[5] = 0;
t.xy2d[6] = 0;
t.xy2d[7] = 0;
t.xy2d[8] = 0;
t.xy2d[9] = 0;
cmov(t, base[pos][0], equal(babs, 1));
cmov(t, base[pos][1], equal(babs, 2));
cmov(t, base[pos][2], equal(babs, 3));
cmov(t, base[pos][3], equal(babs, 4));
cmov(t, base[pos][4], equal(babs, 5));
cmov(t, base[pos][5], equal(babs, 6));
cmov(t, base[pos][6], equal(babs, 7));
cmov(t, base[pos][7], equal(babs, 8));
fe_copy(minust.yplusx, t.yminusx);
fe_copy(minust.yminusx, t.yplusx);
fe_neg(minust.xy2d, t.xy2d);
cmov(t, minust, bnegative);
}
void __device__ __host__ ge_scalarmult_base(ge_p3& __restrict__ h, const unsigned char* __restrict__ a) {
signed char e[64], carry;
signed int x;
ge_p1p1 r;
ge_p2 s;
precomp_data t;
#pragma unroll 32
for (signed int i = 0; i < 32; i++) {
e[2 * i] = a[i] & 15;
e[2 * i + 1] = a[i] >> 4;
}
#pragma unroll 63
for (x = 0, carry = 0; x < 63; x++) {
e[x] += carry;
carry = (e[x] + 8) >> 4;
e[x] -= carry << 4;
}
e[63] += carry;
fe_0(h.X);
fe_1(h.Y);
fe_1(h.Z);
fe_0(h.T);
#pragma unroll 32
for (int i = 1; i < 64; i += 2) {
select(t, i >> 1, e[i]);
ge_madd(r, h, t);
ge_p1p1_to_p3(h, r);
}
ge_p3_dbl(r, h);
ge_p1p1_to_p2(s, r);
ge_p2_dbl(r, s);
ge_p1p1_to_p2(s, r);
ge_p2_dbl(r, s);
ge_p1p1_to_p2(s, r);
ge_p2_dbl(r, s);
ge_p1p1_to_p3(h, r);
#pragma unroll 32
for (int i = 0; i < 64; i += 2) {
select(t, i >> 1, e[i]);
ge_madd(r, h, t);
ge_p1p1_to_p3(h, r);
}
ed25519_copy(r_out, &r);
}
__device__ void ed25519_prepare(unsigned char* __restrict__ e) {
e[0] &= 0xf8;
e[31] &= 0x7f;
e[31] |= 0x40;
}

View File

@@ -1,15 +1,22 @@
#ifndef __ED25519_CUH
#define __ED25519_CUH
struct ed25519_pt { unsigned char x[32], y[32], t[32], z[32]; };
extern __device__ __constant__ struct ed25519_pt ed25519_base;
extern __device__ __constant__ struct ed25519_pt ed25519_neutral;
extern __device__ __constant__ unsigned char ed25519_d[32];
extern __device__ __constant__ unsigned char ed25519_k[32];
__device__ void ed25519_unproject(unsigned char* x, unsigned char* y, const struct ed25519_pt* p);
__device__ void ed25519_pack(unsigned char* c, const unsigned char* x, const unsigned char* y);
__device__ __forceinline__ void ed25519_add(struct ed25519_pt* r, const struct ed25519_pt* p1, const struct ed25519_pt* p2);
__device__ __forceinline__ void ed25519_double(struct ed25519_pt* r, const struct ed25519_pt* p);
__device__ __forceinline__ void ed25519_copy(struct ed25519_pt* dst, const struct ed25519_pt* src);
__device__ void ed25519_smult(struct ed25519_pt* r_out, const struct ed25519_pt* p, const unsigned char* e);
__device__ void ed25519_prepare(unsigned char* e);
typedef struct {
signed long X[10], Y[10], Z[10];
} ge_p2;
typedef struct {
signed long X[10], Y[10], Z[10], T[10];
} ge_p3;
typedef struct {
signed long X[10], Y[10], Z[10], T[10];
} ge_p1p1;
typedef struct {
signed long yplusx[10], yminusx[10], xy2d[10];
} precomp_data;
void __host__ __device__ ge_p3_tobytes(unsigned char* __restrict__ s, const ge_p3& __restrict__ h);
void __host__ __device__ ge_madd(ge_p1p1* __restrict__ r, const ge_p3& __restrict__ p, const precomp_data& __restrict__ q);
void __host__ __device__ ge_scalarmult_base(ge_p3& __restrict__ h, const unsigned char* __restrict__ a);
void __host__ __device__ ge_p1p1_to_p2(ge_p2& __restrict__ r, const ge_p1p1& __restrict__ p);
void __host__ __device__ ge_p1p1_to_p3(ge_p3& __restrict__ r, const ge_p1p1& __restrict__ p);
void __host__ __device__ ge_p2_dbl(ge_p1p1& __restrict__ r, const ge_p2& __restrict__ p);
void __host__ __device__ ge_p3_dbl(ge_p1p1& __restrict__ r, const ge_p3& __restrict__ p);
#endif

View File

@@ -1,46 +1,20 @@
#include <edsign.cuh>
#include <ed25519.cuh>
#include <edsign.cuh>
#include <sha512.cuh>
__device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret) {
__device__ __forceinline__ void expand_key(unsigned char* __restrict__ expanded, const unsigned char* __restrict__ secret) {
struct sha512_state s;
memcpy(&s, &sha512_initial_state, sizeof(s));
sha512_final(&s, secret);
sha512_get(&s, expanded);
ed25519_prepare(expanded);
expanded[0] &= 0xf8;
expanded[31] = (expanded[31] & 0x7F) | 0x40;
}
__device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p) {
unsigned char x[32], y[32];
ed25519_unproject(x, y, p);
ed25519_pack(packed, x, y);
}
__device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k) {
struct ed25519_pt p;
ed25519_smult(&p, &ed25519_base, k);
pp(r, &p);
}
__device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret) {
__device__ void ed25519_create_keypair(unsigned char* __restrict__ private_key, unsigned char* __restrict__ public_key, unsigned char* __restrict__ seed) {
unsigned char expanded[64];
expand_key(expanded, secret);
sm_pack(pub, expanded);
}
__device__ __forceinline__ void compact_wipe(void* __restrict__ data) {
unsigned char* p = (unsigned char*)data;
unsigned long i = 0;
#pragma unroll
for (; i + 3 < 32; i += 4) {
p[i] = 0;
p[i + 1] = 0;
p[i + 2] = 0;
p[i + 3] = 0;
}
#pragma unroll
for (; i < 32; i++) {
p[i] = 0;
}
}
__device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]) {
edsign_sec_to_pub(public_key, random_seed);
memcpy(private_key, random_seed, 32);
expand_key(expanded, seed);
ge_p3 A;
ge_scalarmult_base(A, expanded);
ge_p3_tobytes(public_key, A);
memcpy(private_key, seed, 32);
memcpy(private_key + 32, public_key, 32);
compact_wipe(random_seed);
}

View File

@@ -1,9 +1,5 @@
#ifndef __EDSIGN_CUH
#define __EDSIGN_CUH
__device__ __forceinline__ void expand_key(unsigned char* expanded, const unsigned char* secret);
__device__ __forceinline__ void pp(unsigned char* packed, const struct ed25519_pt* p);
__device__ __forceinline__ void sm_pack(unsigned char* r, const unsigned char* k);
__device__ __forceinline__ void edsign_sec_to_pub(unsigned char* pub, const unsigned char* secret);
__device__ __forceinline__ void compact_wipe(void* __restrict__ data);
__device__ void ed25519_keygen(unsigned char private_key[64], unsigned char public_key[32], unsigned char random_seed[32]);
__device__ __forceinline__ void expand_key(unsigned char* __restrict__ expanded, const unsigned char* __restrict__ secret);
__device__ void ed25519_create_keypair(unsigned char* __restrict__ private_key, unsigned char* __restrict__ public_key, unsigned char* __restrict__ seed);
#endif

View File

@@ -1,137 +1,409 @@
#include <f25519.cuh>
#include <cuda_runtime.h>
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a) {
const uint4* src = reinterpret_cast<const uint4*>(a);
uint4* dst = reinterpret_cast<uint4*>(x);
dst[0] = src[0];
dst[1] = src[1];
// h = {0};
void __device__ __host__ fe_0(fe& __restrict__ h) {
h[0] = 0;
h[1] = 0;
h[2] = 0;
h[3] = 0;
h[4] = 0;
h[5] = 0;
h[6] = 0;
h[7] = 0;
h[8] = 0;
h[9] = 0;
}
__device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond) {
unsigned mask = static_cast<unsigned>(-cond);
uint4* d = reinterpret_cast<uint4*>(dst);
const uint4* z = reinterpret_cast<const uint4*>(zero);
const uint4* o = reinterpret_cast<const uint4*>(one);
uint4 res0, res1;
res0.x = (z[0].x & ~mask) | (o[0].x & mask);
res0.y = (z[0].y & ~mask) | (o[0].y & mask);
res0.z = (z[0].z & ~mask) | (o[0].z & mask);
res0.w = (z[0].w & ~mask) | (o[0].w & mask);
res1.x = (z[1].x & ~mask) | (o[1].x & mask);
res1.y = (z[1].y & ~mask) | (o[1].y & mask);
res1.z = (z[1].z & ~mask) | (o[1].z & mask);
res1.w = (z[1].w & ~mask) | (o[1].w & mask);
d[0] = res0;
d[1] = res1;
// h = {1,0,0,0,0,0,0,0,0,0,0};
void __device__ __host__ fe_1(fe& __restrict__ h) {
h[0] = 1;
h[1] = 0;
h[2] = 0;
h[3] = 0;
h[4] = 0;
h[5] = 0;
h[6] = 0;
h[7] = 0;
h[8] = 0;
h[9] = 0;
}
__device__ void f25519_normalize(unsigned char* __restrict__ x) {
__align__(32) unsigned char minusp[32];
unsigned c = (x[31] >> 7) * 19;
x[31] &= 127;
#pragma unroll
for (int i = 0; i < 32; i++) {
c += x[i];
x[i] = (unsigned char)c;
c >>= 8;
}
c = 19;
#pragma unroll
for (int i = 0; i < 31; i++) {
c += x[i];
minusp[i] = (unsigned char)c;
c >>= 8;
}
c += x[31] - 128;
minusp[31] = (unsigned char)c;
f25519_select(x, minusp, x, (c >> 15) & 1);
// h = f + g
void __device__ __host__ fe_add(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) {
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
long h0 = f0 + g0, h1 = f1 + g1, h2 = f2 + g2, h3 = f3 + g3, h4 = f4 + g4, h5 = f5 + g5, h6 = f6 + g6, h7 = f7 + g7, h8 = f8 + g8, h9 = f9 + g9;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
h[5] = h5;
h[6] = h6;
h[7] = h7;
h[8] = h8;
h[9] = h9;
}
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 0;
#pragma unroll
for (int i = 0; i < 32; i++) {
c = (c >> 8) + ((unsigned)a[i]) + ((unsigned)b[i]);
r[i] = (unsigned char)c;
}
r[31] &= 127;
c = (c >> 7) * 19;
#pragma unroll
for (int i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
void __device__ __host__ fe_cmov(fe& __restrict__ f, const fe& __restrict__ g, const unsigned b) {
int x = -b;
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
long x0 = (f0 ^ g0) & x, x1 = (f1 ^ g1) & x, x2 = (f2 ^ g2) & x, x3 = (f3 ^ g3) & x, x4 = (f4 ^ g4) & x, x5 = (f5 ^ g5) & x, x6 = (f6 ^ g6) & x, x7 = (f7 ^ g7) & x, x8 = (f8 ^ g8) & x, x9 = (f9 ^ g9) & x;
f[0] = f0 ^ x0;
f[1] = f1 ^ x1;
f[2] = f2 ^ x2;
f[3] = f3 ^ x3;
f[4] = f4 ^ x4;
f[5] = f5 ^ x5;
f[6] = f6 ^ x6;
f[7] = f7 ^ x7;
f[8] = f8 ^ x8;
f[9] = f9 ^ x9;
}
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 218;
#pragma unroll
for (int i = 0; i < 31; i++) {
c += 65280 + ((unsigned)a[i]) - ((unsigned)b[i]);
r[i] = (unsigned char)c;
c >>= 8;
}
c += ((unsigned)a[31]) - ((unsigned)b[31]);
r[31] = (unsigned char)(c & 127);
c = (c >> 7) * 19;
#pragma unroll
for (int i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
// h[0..9] = f[0..9]
void __device__ __host__ fe_copy(fe& __restrict__ h, const fe& __restrict__ f) {
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
h[0] = f0;
h[1] = f1;
h[2] = f2;
h[3] = f3;
h[4] = f4;
h[5] = f5;
h[6] = f6;
h[7] = f7;
h[8] = f8;
h[9] = f9;
}
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a) {
unsigned c = 218;
#pragma unroll
for (int i = 0; i < 31; i++) {
c += 65280 - ((unsigned)a[i]);
r[i] = (unsigned char)c;
c >>= 8;
void __device__ __host__ fe_invert(fe& __restrict__ out, const fe& __restrict__ z) {
fe t0, t1, t2, t3;
fe_mul(t0, z, z);
fe_mul(t1, t0, t0);
fe_mul(t1, t1, t1);
fe_mul(t1, z, t1);
fe_mul(t0, t0, t1);
fe_mul(t2, t0, t0);
fe_mul(t1, t1, t2);
fe_mul(t2, t1, t1);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t1, t2, t1);
fe_mul(t2, t1, t1);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t1);
fe_mul(t3, t2, t2);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t3, t3, t3);
fe_mul(t2, t3, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t2, t2, t2);
fe_mul(t1, t2, t1);
fe_mul(t2, t1, t1);
#pragma unroll 49
for (int i = 0; i < 49; ++i) {
fe_mul(t2, t2, t2);
}
c -= ((unsigned)a[31]);
r[31] = (unsigned char)(c & 127);
c = (c >> 7) * 19;
#pragma unroll
for (int i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
fe_mul(t2, t2, t1);
fe_mul(t3, t2, t2);
#pragma unroll 99
for (int i = 0; i < 99; ++i) {
fe_mul(t3, t3, t3);
}
fe_mul(t2, t3, t2);
fe_mul(t2, t2, t2);
#pragma unroll 49
for (int i = 0; i < 49; ++i) {
fe_mul(t2, t2, t2);
}
fe_mul(t1, t2, t1);
fe_mul(t1, t1, t1);
fe_mul(t1, t1, t1);
fe_mul(t1, t1, t1);
fe_mul(t1, t1, t1);
fe_mul(t1, t1, t1);
fe_mul(out, t1, t0);
}
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b) {
unsigned c = 0;
#pragma unroll
for (int i = 0; i < 32; i++) {
c >>= 8;
for (int j = 0; j <= i; j++)
c += ((unsigned)a[j]) * ((unsigned)b[i - j]);
for (int j = i + 1; j < 32; j++)
c += ((unsigned)a[j]) * ((unsigned)b[32 + i - j]) * 38;
r[i] = (unsigned char)c;
}
r[31] &= 127;
c = (c >> 7) * 19;
#pragma unroll
for (int i = 0; i < 32; i++) {
c += r[i];
r[i] = (unsigned char)c;
c >>= 8;
}
int __device__ __host__ fe_isnegative(const fe& __restrict__ f) {
unsigned char s[32];
fe_tobytes(s, f);
return s[0] & 1;
}
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x) {
__align__(32) unsigned char s[32];
f25519_mul__distinct(s, x, x);
f25519_mul__distinct(r, s, x);
#pragma unroll
for (int i = 0; i < 248; i++) {
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
}
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, s);
f25519_mul__distinct(s, r, x);
f25519_mul__distinct(r, s, s);
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
f25519_mul__distinct(s, r, r);
f25519_mul__distinct(r, s, x);
// h = f * g
void __device__ __host__ fe_mul(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) {
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
long f0g0 = f0 * g0, f0g1 = f0 * g1, f0g2 = f0 * g2, f0g3 = f0 * g3, f0g4 = f0 * g4, f0g5 = f0 * g5, f0g6 = f0 * g6, f0g7 = f0 * g7, f0g8 = f0 * g8, f0g9 = f0 * g9;
long f1g0 = f1 * g0, f1g1_2 = f1 * g1 << 1L, f1g2 = f1 * g2, f1g3_2 = f1 * g3 << 1L, f1g4 = f1 * g4, f1g5_2 = f1 * g5 << 1L, f1g6 = f1 * g6, f1g7_2 = f1 * g7 << 1L, f1g8 = f1 * g8, f1g9_38 = f1 * g9 * 38L;
long f2g0 = f2 * g0, f2g1 = f2 * g1, f2g2 = f2 * g2, f2g3 = f2 * g3, f2g4 = f2 * g4, f2g5 = f2 * g5, f2g6 = f2 * g6, f2g7 = f2 * g7, f2g8_19 = f2 * g8 * 19L, f2g9_19 = f2 * g9 * 38L >> 1L;
long f3g0 = f3 * g0, f3g1_2 = f3 * g1 << 1L, f3g2 = f3 * g2, f3g3_2 = f3 * g3 << 1L, f3g4 = f3 * g4, f3g5_2 = f3 * g5 << 1L, f3g6 = f3 * g6, f3g7_38 = f3 * g7 * 38L, f3g8_19 = f3 * g8 * 19L, f3g9_38 = f3 * g9 * 38L;
long f4g0 = f4 * g0, f4g1 = f4 * g1, f4g2 = f4 * g2, f4g3 = f4 * g3, f4g4 = f4 * g4, f4g5 = f4 * g5, f4g6_19 = f4 * g6 * 19L, f4g7_19 = f4 * g7 * 38L >> 1L, f4g8_19 = f4 * g8 * 19L, f4g9_19 = f4 * g9 * 38L >> 1L;
long f5g0 = f5 * g0, f5g1_2 = f5 * g1 << 1L, f5g2 = f5 * g2, f5g3_2 = f5 * g3 << 1L, f5g4 = f5 * g4, f5g5_38 = f5 * g5 * 38L, f5g6_19 = f5 * g6 * 19L, f5g7_38 = f5 * g7 * 38L, f5g8_19 = f5 * g8 * 19L, f5g9_38 = f5 * g9 * 38L;
long f6g0 = f6 * g0, f6g1 = f6 * g1, f6g2 = f6 * g2, f6g3 = f6 * g3, f6g4_19 = f6 * g4 * 19L, f6g5_19 = f6 * g5 * 38L >> 1L, f6g6_19 = f6 * g6 * 19L, f6g7_19 = f6 * g7 * 38L >> 1L, f6g8_19 = f6 * g8 * 19L, f6g9_19 = f6 * g9 * 38L >> 1L;
long f7g0 = f7 * g0, f7g1_2 = f7 * g1 << 1L, f7g2 = f7 * g2, f7g3_38 = f7 * g3 * 38L, f7g4_19 = f7 * g4 * 19L, f7g5_38 = f7 * g5 * 38L, f7g6_19 = f7 * g6 * 19L, f7g7_38 = f7 * g7 * 38L, f7g8_19 = f7 * g8 * 19L, f7g9_38 = f7 * g9 * 38L;
long f8g0 = f8 * g0, f8g1 = f8 * g1, f8g2_19 = f8 * g2 * 19L, f8g3_19 = f8 * g3 * 38L >> 1L, f8g4_19 = f8 * g4 * 19L, f8g5_19 = f8 * g5 * 38L >> 1L, f8g6_19 = f8 * g6 * 19L, f8g7_19 = f8 * g7 * 38L >> 1L, f8g8_19 = f8 * g8 * 19L, f8g9_19 = f8 * g9 * 38L >> 1L;
long f9g0 = f9 * g0, f9g1_38 = f9 * g1 * 38L, f9g2_19 = f9 * g2 * 19L, f9g3_38 = f9 * g3 * 38L, f9g4_19 = f9 * g4 * 19L, f9g5_38 = f9 * g5 * 38L, f9g6_19 = f9 * g6 * 19L, f9g7_38 = f9 * g7 * 38L, f9g8_19 = f9 * g8 * 19L, f9g9_38 = f9 * g9 * 38L;
long h0 = f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
long h1 = f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
long h2 = f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
long h3 = f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
long h4 = f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
long h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
long h6 = f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38;
long h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19;
long h8 = f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38;
long h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0;
long carry = (h0 + 33554432L) >> 26L;
h1 += carry;
h0 -= carry << 26L;
carry = (h4 + 33554432L) >> 26L;
h5 += carry;
h4 -= carry << 26L;
carry = (h1 + 16777216L) >> 25L;
h2 += carry;
h1 -= carry << 25L;
carry = (h5 + 16777216L) >> 25L;
h6 += carry;
h5 -= carry << 25L;
carry = (h2 + 33554432L) >> 26L;
h3 += carry;
h2 -= carry << 26L;
carry = (h6 + 33554432L) >> 26L;
h7 += carry;
h6 -= carry << 26L;
carry = (h3 + 16777216L) >> 25L;
h4 += carry;
h3 -= carry << 25L;
carry = (h7 + 16777216L) >> 25L;
h8 += carry;
h7 -= carry << 25L;
carry = (h4 + 33554432L) >> 26L;
h5 += carry;
h4 -= carry << 26L;
carry = (h8 + 33554432L) >> 26L;
h9 += carry;
h8 -= carry << 26L;
carry = (h9 + 16777216L) >> 25L;
h0 += carry * 19L;
h9 -= carry << 25L;
carry = (h0 + 33554432L) >> 26L;
h1 += carry;
h0 -= carry << 26L;
h[0] = static_cast<int>(h0);
h[1] = static_cast<int>(h1);
h[2] = static_cast<int>(h2);
h[3] = static_cast<int>(h3);
h[4] = static_cast<int>(h4);
h[5] = static_cast<int>(h5);
h[6] = static_cast<int>(h6);
h[7] = static_cast<int>(h7);
h[8] = static_cast<int>(h8);
h[9] = static_cast<int>(h9);
}
// h = -f
void __host__ __device__ fe_neg(fe& __restrict__ h, const fe& __restrict__ f) {
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
long h0 = -f0, h1 = -f1, h2 = -f2, h3 = -f3, h4 = -f4, h5 = -f5, h6 = -f6, h7 = -f7, h8 = -f8, h9 = -f9;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
h[5] = h5;
h[6] = h6;
h[7] = h7;
h[8] = h8;
h[9] = h9;
}
void __host__ __device__ fe_mul2(fe& __restrict__ h, const fe& __restrict__ f) {
const long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
long f0_2 = f0 << 1, f1_2 = f1 << 1, f2_2 = f2 << 1, f3_2 = f3 << 1, f4_2 = f4 << 1, f5_2 = f5 << 1, f6_2 = f6 << 1, f7_2 = f7 << 1;
long f5_38 = 38 * f5, f6_19 = 19 * f6, f7_38 = 38 * f7, f8_19 = 19 * f8, f9_38 = 38 * f9;
long f0f0 = f0 * f0, f0f1_2 = f0_2 * f1, f0f2_2 = f0_2 * f2, f0f3_2 = f0_2 * f3, f0f4_2 = f0_2 * f4, f0f5_2 = f0_2 * f5, f0f6_2 = f0_2 * f6, f0f7_2 = f0_2 * f7, f0f8_2 = f0_2 * f8, f0f9_2 = f0_2 * f9;
long f1f1_2 = f1_2 * f1, f1f2_2 = f1_2 * f2, f1f3_4 = f1_2 * f3_2, f1f4_2 = f1_2 * f4, f1f5_4 = f1_2 * f5_2, f1f6_2 = f1_2 * f6, f1f7_4 = f1_2 * f7_2, f1f8_2 = f1_2 * f8, f1f9_76 = f1_2 * f9_38;
long f2f2 = f2 * f2, f2f3_2 = f2_2 * f3, f2f4_2 = f2_2 * f4, f2f5_2 = f2_2 * f5, f2f6_2 = f2_2 * f6, f2f7_2 = f2_2 * f7, f2f8_38 = f2_2 * f8_19, f2f9_38 = f2 * f9_38;
long f3f3_2 = f3_2 * f3, f3f4_2 = f3_2 * f4, f3f5_4 = f3_2 * f5_2, f3f6_2 = f3_2 * f6, f3f7_76 = f3_2 * f7_38, f3f8_38 = f3_2 * f8_19, f3f9_76 = f3_2 * f9_38;
long f4f4 = f4 * f4, f4f5_2 = f4_2 * f5, f4f6_38 = f4_2 * f6_19, f4f7_38 = f4 * f7_38, f4f8_38 = f4_2 * f8_19, f4f9_38 = f4 * f9_38;
long f5f5_38 = f5 * f5_38, f5f6_38 = f5_2 * f6_19, f5f7_76 = f5_2 * f7_38, f5f8_38 = f5_2 * f8_19, f5f9_76 = f5_2 * f9_38;
long f6f6_19 = f6 * f6_19, f6f7_38 = f6 * f7_38, f6f8_38 = f6_2 * f8_19, f6f9_38 = f6 * f9_38;
long f7f7_38 = f7 * f7_38, f7f8_38 = f7_2 * f8_19, f7f9_76 = f7_2 * f9_38;
long f8f8_19 = f8 * f8_19, f8f9_38 = f8 * f9_38;
long f9f9_38 = f9 * f9_38;
long h0 = (f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38) << 1;
long h1 = (f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38) << 1;
long h2 = (f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19) << 1;
long h3 = (f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38) << 1;
long h4 = (f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38) << 1;
long h5 = (f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38) << 1;
long h6 = (f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19) << 1;
long h7 = (f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38) << 1;
long h8 = (f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38) << 1;
long h9 = (f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2) << 1;
long carry = (h0 + 33554432L) >> 26L;
h1 += carry;
h0 -= carry << 26L;
carry = (h4 + 33554432L) >> 26L;
h5 += carry;
h4 -= carry << 26L;
carry = (h1 + 16777216L) >> 25L;
h2 += carry;
h1 -= carry << 25L;
carry = (h5 + 16777216L) >> 25L;
h6 += carry;
h5 -= carry << 25L;
carry = (h2 + 33554432L) >> 26L;
h3 += carry;
h2 -= carry << 26L;
carry = (h6 + 33554432L) >> 26L;
h7 += carry;
h6 -= carry << 26L;
carry = (h3 + 16777216L) >> 25L;
h4 += carry;
h3 -= carry << 25L;
carry = (h7 + 16777216L) >> 25L;
h8 += carry;
h7 -= carry << 25L;
carry = (h4 + 33554432L) >> 26L;
h5 += carry;
h4 -= carry << 26L;
carry = (h8 + 33554432L) >> 26L;
h9 += carry;
h8 -= carry << 26L;
carry = (h9 + 16777216L) >> 25L;
h0 += carry * 19L;
h9 -= carry << 25L;
carry = (h0 + 33554432L) >> 26L;
h1 += carry;
h0 -= carry << 26L;
h[0] = static_cast<int>(h0);
h[1] = static_cast<int>(h1);
h[2] = static_cast<int>(h2);
h[3] = static_cast<int>(h3);
h[4] = static_cast<int>(h4);
h[5] = static_cast<int>(h5);
h[6] = static_cast<int>(h6);
h[7] = static_cast<int>(h7);
h[8] = static_cast<int>(h8);
h[9] = static_cast<int>(h9);
}
void __device__ __host__ fe_sub(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g) {
long f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3], f4 = f[4], f5 = f[5], f6 = f[6], f7 = f[7], f8 = f[8], f9 = f[9];
long g0 = g[0], g1 = g[1], g2 = g[2], g3 = g[3], g4 = g[4], g5 = g[5], g6 = g[6], g7 = g[7], g8 = g[8], g9 = g[9];
long h0 = f0 - g0, h1 = f1 - g1, h2 = f2 - g2, h3 = f3 - g3, h4 = f4 - g4, h5 = f5 - g5, h6 = f6 - g6, h7 = f7 - g7, h8 = f8 - g8, h9 = f9 - g9;
h[0] = h0;
h[1] = h1;
h[2] = h2;
h[3] = h3;
h[4] = h4;
h[5] = h5;
h[6] = h6;
h[7] = h7;
h[8] = h8;
h[9] = h9;
}
void __device__ __host__ fe_tobytes(unsigned char* __restrict__ s, const fe& __restrict__ h) {
long h0 = h[0], h1 = h[1], h2 = h[2], h3 = h[3], h4 = h[4], h5 = h[5], h6 = h[6], h7 = h[7], h8 = h[8], h9 = h[9], carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9, q;
q = (19 * h9 + (1 << 24)) >> 25;
q = (h0 + q) >> 26;
q = (h1 + q) >> 25;
q = (h2 + q) >> 26;
q = (h3 + q) >> 25;
q = (h4 + q) >> 26;
q = (h5 + q) >> 25;
q = (h6 + q) >> 26;
q = (h7 + q) >> 25;
q = (h8 + q) >> 26;
q = (h9 + q) >> 25;
h0 += 19 * q;
carry0 = h0 >> 26;
h1 += carry0;
h0 -= carry0 << 26;
carry1 = h1 >> 25;
h2 += carry1;
h1 -= carry1 << 25;
carry2 = h2 >> 26;
h3 += carry2;
h2 -= carry2 << 26;
carry3 = h3 >> 25;
h4 += carry3;
h3 -= carry3 << 25;
carry4 = h4 >> 26;
h5 += carry4;
h4 -= carry4 << 26;
carry5 = h5 >> 25;
h6 += carry5;
h5 -= carry5 << 25;
carry6 = h6 >> 26;
h7 += carry6;
h6 -= carry6 << 26;
carry7 = h7 >> 25;
h8 += carry7;
h7 -= carry7 << 25;
carry8 = h8 >> 26;
h9 += carry8;
h8 -= carry8 << 26;
carry9 = h9 >> 25;
h9 -= carry9 << 25;
s[0] = static_cast<unsigned char>(h0);
s[1] = static_cast<unsigned char>(h0 >> 8);
s[2] = static_cast<unsigned char>(h0 >> 16);
s[3] = static_cast<unsigned char>((h0 >> 24) | (h1 << 2));
s[4] = static_cast<unsigned char>(h1 >> 6);
s[5] = static_cast<unsigned char>(h1 >> 14);
s[6] = static_cast<unsigned char>((h1 >> 22) | (h2 << 3));
s[7] = static_cast<unsigned char>(h2 >> 5);
s[8] = static_cast<unsigned char>(h2 >> 13);
s[9] = static_cast<unsigned char>((h2 >> 21) | (h3 << 5));
s[10] = static_cast<unsigned char>(h3 >> 3);
s[11] = static_cast<unsigned char>(h3 >> 11);
s[12] = static_cast<unsigned char>((h3 >> 19) | (h4 << 6));
s[13] = static_cast<unsigned char>(h4 >> 2);
s[14] = static_cast<unsigned char>(h4 >> 10);
s[15] = static_cast<unsigned char>(h4 >> 18);
s[16] = static_cast<unsigned char>(h5);
s[17] = static_cast<unsigned char>(h5 >> 8);
s[18] = static_cast<unsigned char>(h5 >> 16);
s[19] = static_cast<unsigned char>((h5 >> 24) | (h6 << 1));
s[20] = static_cast<unsigned char>(h6 >> 7);
s[21] = static_cast<unsigned char>(h6 >> 15);
s[22] = static_cast<unsigned char>((h6 >> 23) | (h7 << 3));
s[23] = static_cast<unsigned char>(h7 >> 5);
s[24] = static_cast<unsigned char>(h7 >> 13);
s[25] = static_cast<unsigned char>((h7 >> 21) | (h8 << 4));
s[26] = static_cast<unsigned char>(h8 >> 4);
s[27] = static_cast<unsigned char>(h8 >> 12);
s[28] = static_cast<unsigned char>((h8 >> 20) | (h9 << 6));
s[29] = static_cast<unsigned char>(h9 >> 2);
s[30] = static_cast<unsigned char>(h9 >> 10);
s[31] = static_cast<unsigned char>(h9 >> 18);
}

View File

@@ -1,11 +1,16 @@
#ifndef __F25519_CUH
#define __F25519_CUH
__device__ void f25519_copy(unsigned char* __restrict__ x, const unsigned char* __restrict__ a);
__device__ void f25519_select(unsigned char* __restrict__ dst, const unsigned char* __restrict__ zero, const unsigned char* __restrict__ one, unsigned char cond);
__device__ void f25519_normalize(unsigned char* __restrict__ x);
__device__ void f25519_add(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b);
__device__ void f25519_sub(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b);
__device__ void f25519_neg(unsigned char* __restrict__ r, const unsigned char* __restrict__ a);
__device__ void f25519_mul__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ a, const unsigned char* __restrict__ b);
__device__ void f25519_inv__distinct(unsigned char* __restrict__ r, const unsigned char* __restrict__ x);
#endif
using fe = signed long[10];
void __device__ __host__ fe_0(fe& __restrict__ h);
void __device__ __host__ fe_1(fe& __restrict__ h);
void __device__ __host__ fe_tobytes(unsigned char* __restrict__ s, const fe& __restrict__ h);
void __device__ __host__ fe_copy(fe& __restrict__ h, const fe& __restrict__ f);
int __device__ __host__ fe_isnegative(const fe& __restrict__ f);
void __device__ __host__ fe_cmov(fe& __restrict__ f, const fe& __restrict__ g, const unsigned b);
void __device__ __host__ fe_neg(fe& __restrict__ h, const fe& __restrict__ f);
void __device__ __host__ fe_add(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g);
void __device__ __host__ fe_invert(fe& __restrict__ out, const fe& __restrict__ z);
void __device__ __host__ fe_mul2(fe& __restrict__ h, const fe& __restrict__ f);
void __device__ __host__ fe_mul(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g);
void __device__ __host__ fe_sub(fe& __restrict__ h, const fe& __restrict__ f, const fe& __restrict__ g);
#endif

View File

@@ -1,27 +1,28 @@
#include <keymanip.cuh>
static __constant__ const char* hexDigits = "0123456789abcdef";
__device__ ds64 ktos(const unsigned char* key) noexcept {
ds64 str;
const char* hexDigits = "0123456789abcdef";
#pragma unroll 32
for (unsigned char i = 0; i < 32; i++) {
for (signed int i = 0; i < 32; i++) {
str.data[2 * i] = hexDigits[key[i] >> 4];
str.data[2 * i + 1] = hexDigits[key[i] & 0x0F];
}
str.data[65] = '\0';
return str;
}
__device__ ds46 getAddr(const unsigned char rawAddr[16]) noexcept {
__device__ ds46 getAddr(const Addr16 rawAddr) noexcept {
ds46 addrStr;
const char* hexDigits = "0123456789abcdef";
unsigned pos = 0;
#pragma unroll 8
for (unsigned char group = 0; group < 8; group++) {
int idx = group * 2;
for (signed int group = 0; group < 8; group++) {
int idx = group << 1;
addrStr.data[pos++] = hexDigits[rawAddr[idx] >> 4];
addrStr.data[pos++] = hexDigits[rawAddr[idx] & 0x0F];
addrStr.data[pos++] = hexDigits[rawAddr[idx + 1] >> 4];
addrStr.data[pos++] = hexDigits[rawAddr[idx + 1] & 0x0F];
if (group < 7) { addrStr.data[pos++] = ':'; }
if (group < 7) {
addrStr.data[pos++] = ':';
}
}
addrStr.data[pos] = '\0';
return addrStr;
@@ -31,7 +32,7 @@ __device__ void getRawAddress(int lErase, Key32& InvertedPublicKey, Addr16& rawA
const int bitsToShift = lErase & 7;
const int start = lErase >> 3;
if (bitsToShift) {
#pragma unroll
#pragma unroll
for (int i = start; i < start + 15; i++) {
InvertedPublicKey[i] = static_cast<unsigned char>((InvertedPublicKey[i] << bitsToShift) | (InvertedPublicKey[i + 1] >> (8 - bitsToShift)));
}
@@ -41,6 +42,9 @@ __device__ void getRawAddress(int lErase, Key32& InvertedPublicKey, Addr16& rawA
memcpy(&rawAddr[2], &InvertedPublicKey[start], 14);
}
__device__ void invertKey(const unsigned char* key, unsigned char* inverted) {
#pragma unroll 32
for (unsigned char i = 0; i < 32; i++) inverted[i] = key[i] ^ 0xFF;
#pragma unroll 8
for (int i = 0; i < 32; i += 4) {
uchar4 k = *(reinterpret_cast<const uchar4*>(&key[i]));
*(reinterpret_cast<uchar4*>(&inverted[i])) = make_uchar4(k.x ^ 0xFF, k.y ^ 0xFF, k.z ^ 0xFF, k.w ^ 0xFF);
}
}

View File

@@ -9,11 +9,10 @@ struct ds46 {
using Addr16 = unsigned char[16];
using Key32 = unsigned char[32];
struct KeysBox32 {
Key32 PublicKey;
Key32 PrivateKey;
Key32 PublicKey, PrivateKey;
};
__device__ ds64 ktos(const unsigned char* key) noexcept;
__device__ ds46 getAddr(const unsigned char rawAddr[16]) noexcept;
__device__ ds46 getAddr(const Addr16 rawAddr) noexcept;
__device__ void getRawAddress(int lErase, Key32& InvertedPublicKey, Addr16& rawAddr) noexcept;
__device__ void invertKey(const unsigned char* key, unsigned char* inverted);
#endif

1389
libs/precomp_data.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,51 +1,109 @@
#include <sha512.cuh>
__device__ __constant__ sha512_state sha512_initial_state = { {
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL,
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL,
} };
__device__ __constant__ unsigned long round_k[80] = {
0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
0x428a2f98d728ae22ULL,
0x7137449123ef65cdULL,
0xb5c0fbcfec4d3b2fULL,
0xe9b5dba58189dbbcULL,
0x3956c25bf348b538ULL,
0x59f111f1b605d019ULL,
0x923f82a4af194f9bULL,
0xab1c5ed5da6d8118ULL,
0xd807aa98a3030242ULL,
0x12835b0145706fbeULL,
0x243185be4ee4b28cULL,
0x550c7dc3d5ffb4e2ULL,
0x72be5d74f27b896fULL,
0x80deb1fe3b1696b1ULL,
0x9bdc06a725c71235ULL,
0xc19bf174cf692694ULL,
0xe49b69c19ef14ad2ULL,
0xefbe4786384f25e3ULL,
0x0fc19dc68b8cd5b5ULL,
0x240ca1cc77ac9c65ULL,
0x2de92c6f592b0275ULL,
0x4a7484aa6ea6e483ULL,
0x5cb0a9dcbd41fbd4ULL,
0x76f988da831153b5ULL,
0x983e5152ee66dfabULL,
0xa831c66d2db43210ULL,
0xb00327c898fb213fULL,
0xbf597fc7beef0ee4ULL,
0xc6e00bf33da88fc2ULL,
0xd5a79147930aa725ULL,
0x06ca6351e003826fULL,
0x142929670a0e6e70ULL,
0x27b70a8546d22ffcULL,
0x2e1b21385c26c926ULL,
0x4d2c6dfc5ac42aedULL,
0x53380d139d95b3dfULL,
0x650a73548baf63deULL,
0x766a0abb3c77b2a8ULL,
0x81c2c92e47edaee6ULL,
0x92722c851482353bULL,
0xa2bfe8a14cf10364ULL,
0xa81a664bbc423001ULL,
0xc24b8b70d0f89791ULL,
0xc76c51a30654be30ULL,
0xd192e819d6ef5218ULL,
0xd69906245565a910ULL,
0xf40e35855771202aULL,
0x106aa07032bbd1b8ULL,
0x19a4c116b8d2d0c8ULL,
0x1e376c085141ab53ULL,
0x2748774cdf8eeb99ULL,
0x34b0bcb5e19b48a8ULL,
0x391c0cb3c5c95a63ULL,
0x4ed8aa4ae3418acbULL,
0x5b9cca4f7763e373ULL,
0x682e6ff3d6b2b8a3ULL,
0x748f82ee5defb2fcULL,
0x78a5636f43172f60ULL,
0x84c87814a1f0ab72ULL,
0x8cc702081a6439ecULL,
0x90befffa23631e28ULL,
0xa4506cebde82bde9ULL,
0xbef9a3f7b2c67915ULL,
0xc67178f2e372532bULL,
0xca273eceea26619cULL,
0xd186b8c721c0c207ULL,
0xeada7dd6cde0eb1eULL,
0xf57d4f7fee6ed178ULL,
0x06f067aa72176fbaULL,
0x0a637dc5a2c898a6ULL,
0x113f9804bef90daeULL,
0x1b710b35131c471bULL,
0x28db77f523047d84ULL,
0x32caab7b40c72493ULL,
0x3c9ebe0a15c9bebcULL,
0x431d67c49c100d4cULL,
0x4cc5d4becb3e42b6ULL,
0x597f299cfc657e2aULL,
0x5fcb6fab3ad6faecULL,
0x6c44198c4a475817ULL,
};
__device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x) {
return (static_cast<unsigned long>(x[0]) << 56) | (static_cast<unsigned long>(x[1]) << 48) |
(static_cast<unsigned long>(x[2]) << 40) | (static_cast<unsigned long>(x[3]) << 32) |
(static_cast<unsigned long>(x[4]) << 24) | (static_cast<unsigned long>(x[5]) << 16) |
(static_cast<unsigned long>(x[6]) << 8) | (static_cast<unsigned long>(x[7]));
return (static_cast<unsigned long>(x[0]) << 56) | (static_cast<unsigned long>(x[1]) << 48) | (static_cast<unsigned long>(x[2]) << 40) | (static_cast<unsigned long>(x[3]) << 32) | (static_cast<unsigned long>(x[4]) << 24) | (static_cast<unsigned long>(x[5]) << 16) | (static_cast<unsigned long>(x[6]) << 8) | (static_cast<unsigned long>(x[7]));
}
__device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v) {
__device__ inline void store64(unsigned char* __restrict__ x, unsigned long v) {
#pragma unroll 8
for (int i = 0; i < 8; i++) {
x[i] = (unsigned char)(v >> (56 - i * 8));
}
for (unsigned char i = 0; i < 8; i++)
x[i] = static_cast<unsigned char>(v >> (56 - i * 8));
}
__device__ __forceinline__ unsigned long rot64(unsigned long x, int bits) {
return (x >> bits) | (x << (64 - bits));
}
__device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk) {
#define rot64(x, bits) ((x >> bits) | (x << (64 - bits)))
__device__ void sha512_block(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk) {
unsigned long w[16];
#pragma unroll 16
for (int i = 0; i < 16; i++) {
for (unsigned char i = 0; i < 16; i++) {
w[i] = load64(blk + i * 8);
}
unsigned long a = s->h[0];
@@ -57,7 +115,7 @@ __device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const
unsigned long g = s->h[6];
unsigned long h = s->h[7];
#pragma unroll 80
for (int i = 0; i < 80; i++) {
for (unsigned char i = 0; i < 80; i++) {
const int idx = i & 15;
const int idx1 = (i + 1) & 15;
const int idx7 = (i + 9) & 15;
@@ -87,7 +145,7 @@ __device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const
s->h[6] += g;
s->h[7] += h;
}
__device__ void sha512_final(sha512_state* s, const unsigned char* blk) {
__device__ void sha512_final(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk) {
unsigned char temp[128];
unsigned long last_size = 32 & (128 - 1);
memcpy(temp, blk, last_size);
@@ -99,23 +157,23 @@ __device__ void sha512_final(sha512_state* s, const unsigned char* blk) {
store64(temp + 128 - 8, 256);
sha512_block(s, temp);
}
__device__ void sha512_get(const sha512_state* s, unsigned char* hash) {
__device__ void sha512_get(const sha512_state* __restrict__ s, unsigned char* __restrict__ hash) {
unsigned len = 64;
if (len > 128) len = 128;
if (len > 128)
len = 128;
unsigned i = 0, c = (len < 8) ? len : 8;
store64(hash, s->h[i]);
hash += c;
len -= c;
i++;
#pragma unroll
while (len >= 8) {
store64(hash, s->h[i]);
hash += 8;
len -= 8;
i++;
}
if (len > 0) {
unsigned char tmp[8];
store64(tmp, s->h[i]);
memcpy(hash, tmp, len);
}
unsigned char tmp[8];
store64(tmp, s->h[i]);
memcpy(hash, tmp, len);
}

View File

@@ -5,10 +5,9 @@ struct sha512_state {
};
extern __device__ __constant__ sha512_state sha512_initial_state;
extern __device__ __constant__ unsigned long round_k[80];
__device__ __forceinline__ unsigned long load64(const unsigned char* x);
__device__ __forceinline__ void store64(unsigned char* x, unsigned long v);
__device__ __forceinline__ unsigned long rot64(unsigned long x, int bits);
__device__ __forceinline__ void sha512_block(sha512_state* s, const unsigned char* blk);
__device__ void sha512_final(sha512_state* s, const unsigned char* blk);
__device__ void sha512_get(const sha512_state* s, unsigned char* hash);
__device__ __forceinline__ unsigned long load64(const unsigned char* __restrict__ x);
__device__ __forceinline__ void store64(unsigned char* __restrict__ x, unsigned long v);
__device__ __forceinline__ void sha512_block(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk);
__device__ void sha512_final(sha512_state* __restrict__ s, const unsigned char* __restrict__ blk);
__device__ void sha512_get(const sha512_state* __restrict__ s, unsigned char* __restrict__ hash);
#endif

View File

@@ -1,18 +1,22 @@
__device__ int cstring_length(const char* s) {
int len = 0;
while (s[len]) len++;
while (s[len])
len++;
return len;
}
__device__ int cstring_find(const char* s, const char* sub) {
int i, j;
int n = cstring_length(s);
int m = cstring_length(sub);
if (m == 0) return 0;
if (m == 0)
return 0;
for (i = 0; i <= n - m; i++) {
for (j = 0; j < m; j++) {
if (s[i + j] != sub[j]) break;
if (s[i + j] != sub[j])
break;
}
if (j == m) return i;
if (j == m)
return i;
}
return -1;
}
@@ -22,7 +26,8 @@ __device__ int cstring_to_ull(const char* s, unsigned* val) {
if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
i = 2;
}
if (s[i] == '\0') return 1;
if (s[i] == '\0')
return 1;
for (; s[i]; i++) {
char c = s[i];
int digit;
@@ -60,7 +65,8 @@ __device__ void concat(const char* s1, const char* s2, char* out, int outSize) {
}
while (s2[j] && i < outSize - 1) {
out[i] = s2[j];
i++; j++;
i++;
j++;
}
out[i] = '\0';
}

59
meson.build Normal file
View File

@@ -0,0 +1,59 @@
project(
'yggm',
['cpp', 'cuda'],
version: '20.08.2025',
default_options: ['cpp_std=c++20'],
)
cpp = meson.get_compiler('cpp')
cuda = meson.get_compiler('cuda')
common_cpp_flags = [
'-march=native',
'-ffast-math',
'-pipe',
'-funroll-loops',
'-Wpedantic',
'-Wconversion',
'-Wuninitialized',
'-Wsign-conversion',
'-masm=intel',
'-I../libs/',
]
common_cuda_flags = [
'-rdc=true',
'-Xptxas', '-O3',
'-use_fast_math',
'-ftz=true',
'-prec-div=false',
'-prec-sqrt=false',
'-gencode', 'arch=compute_75,code=sm_75',
'-Wno-deprecated-gpu-targets',
'--expt-relaxed-constexpr',
'-I../libs/',
]
add_project_arguments(common_cpp_flags, language: 'cpp')
add_project_arguments(common_cuda_flags, language: 'cuda')
if get_option('buildtype') == 'release'
add_project_arguments(['-DRELEASE'], language: ['cpp', 'cuda'])
elif get_option('buildtype') == 'debug'
add_project_arguments(['-DDEBUG'], language: ['cpp', 'cuda'])
endif
cpp_sources = ['sources/main.cpp'] + files(run_command('find', 'libs', '-name','*.h').stdout().split())
cuda_sources = ['sources/main.cu'] + files(run_command('find', 'libs', '-name', '*.cu').stdout().split())
executable(
'yggmcu',
sources: cuda_sources,
link_args: ['-lcudart', '-L/usr/local/cuda/lib64'],
)
executable(
'yggmc',
sources: cpp_sources,
dependencies: dependency('libsodium'),
)

4
native.ini Normal file
View File

@@ -0,0 +1,4 @@
[binaries]
cpp = 'clang++'
cuda = '/opt/cuda/bin/nvcc'
cuda_host_compiler = '/opt/cuda/bin/nvcc'

View File

@@ -1,24 +1,27 @@
#include <stdio.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <sha512.cuh>
#include <defines.h>
#include <ed25519.cuh>
#include <edsign.cuh>
#include <string.cuh>
#include <keymanip.cuh>
__device__ unsigned d_high = 0x10;
#include <sha512.cuh>
#include <stdio.h>
#include <string.cuh>
__device__ unsigned d_high = 0x14;
__device__ int parameters(const char* arg) noexcept {
if ((cstring_find(arg, "--altitude") == 0 && cstring_length(arg) == 10) || (cstring_find(arg, "-a") == 0 && cstring_length(arg) == 2)) {
return 777;
}
int space_index = cstring_find(arg, " ");
if (space_index == -1) return 0;
if (space_index == -1)
return 0;
const int substr_start = space_index + 1;
char sub_arg[256];
extract_substring(arg, substr_start, sub_arg, 256);
if (cstring_find(arg, "--altitude") != -1 || cstring_find(arg, "-a") != -1) {
unsigned tmp_high;
if (cstring_to_ull(sub_arg, &tmp_high) != 0) return 1;
if (cstring_to_ull(sub_arg, &tmp_high) != 0)
return 1;
d_high = tmp_high;
}
return 0;
@@ -45,59 +48,93 @@ __global__ void args(char** argv, int argc, int* result) {
__device__ __forceinline__ unsigned char zeroCounter(unsigned int x) noexcept {
return x ? static_cast<unsigned char>(__clz(x)) : 32;
}
__device__ __forceinline__ unsigned char getZeros(const unsigned char* v) noexcept {
__device__ __forceinline__ unsigned char getZeros(const unsigned char* __restrict__ v) noexcept {
unsigned char leadZeros = 0;
#pragma unroll
for (int i = 0; i < 32; i += 4) {
unsigned word = (static_cast<unsigned>(v[i]) << 24) | (static_cast<unsigned>(v[i + 1]) << 16) | (static_cast<unsigned>(v[i + 2]) << 8) | (static_cast<unsigned>(v[i + 3]));
if (word == 0)
#pragma unroll 8
for (unsigned char i = 0; i < 32; i += 4) {
unsigned w = (static_cast<unsigned>(v[i]) << 24) | (static_cast<unsigned>(v[i + 1]) << 16) | (static_cast<unsigned>(v[i + 2]) << 8) | (static_cast<unsigned>(v[i + 3]));
if (w == 0)
leadZeros += 32;
else {
leadZeros += zeroCounter(word);
leadZeros += zeroCounter(w);
break;
}
}
return leadZeros;
}
__global__ void initRand(curandState* rs) {
__global__ void initRandSeed(curandState* __restrict__ states, const unsigned long seed) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
curand_init(seed, idx, 0, &states[idx]);
}
__global__ void initRand(curandState* rs, unsigned int* d_seeds) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
curand_init(clock64() + id * 7919ULL, id, 0, &rs[id]);
#pragma unroll 10
for (int i = 0; i < 10; i++) {
curand(&rs[id]);
}
unsigned seed = curand(&rs[id]);
d_seeds[id] = seed;
}
__device__ __forceinline__ void rmbytes(unsigned char* buf, curandState* state) {
#pragma unroll 32
for (unsigned long i = 0; i < 32; i++) {
buf[i] = curand(state) & 0xFF;
int checkSeeds(unsigned* __restrict__ seeds, int count) {
for (int i = 0; i < count; i++) {
for (int j = i + 1; j < count; j++) {
if (seeds[i] == seeds[j]) {
return 1;
}
}
}
return 0;
}
__device__ __forceinline__ void rmbytes(unsigned char* __restrict__ buf, curandState& __restrict__ state) {
#pragma unroll 8
for (int i = 0; i < 8; i++) {
unsigned r = curand(&state);
buf[i * 4] = static_cast<unsigned char>(r & 0xFF);
buf[i * 4 + 1] = static_cast<unsigned char>((r >> 8) & 0xFF);
buf[i * 4 + 2] = static_cast<unsigned char>((r >> 16) & 0xFF);
buf[i * 4 + 3] = static_cast<unsigned char>((r >> 24) & 0xFF);
}
}
__global__ void KeyGen(curandState* randStates) {
curandState localState = randStates[blockIdx.x * blockDim.x + threadIdx.x];
while (true) {
KeysBox32 keys;
__global__ void KeyGenKernel(curandState* __restrict__ randStates) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
curandState localState = randStates[idx];
#ifdef DEBUG
unsigned x = 0;
#endif
while (WHCOND) {
Key32 seed;
rmbytes(seed, &localState);
ed25519_keygen(keys.PrivateKey, keys.PublicKey, seed);
if (unsigned zeros = getZeros(keys.PublicKey); zeros > atomicMax((unsigned*)&d_high, zeros)) {
KeysBox32 keys;
rmbytes(seed, localState);
ed25519_create_keypair(keys.PrivateKey, keys.PublicKey, seed);
if (unsigned char zeros = getZeros(keys.PublicKey); zeros > atomicMax(&d_high, zeros)) {
Addr16 raw;
Key32 inv;
invertKey(keys.PublicKey, inv);
getRawAddress(zeros, inv, raw);
#ifdef RELEASE
printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddr(raw).data, ktos(keys.PublicKey).data, ktos(keys.PrivateKey).data);
#else
printf("\nIPv6:\t%s\nFK:\t%s%s\n", getAddr(raw).data, ktos(keys.PrivateKey).data, ktos(keys.PublicKey).data);
#endif
d_high = zeros;
}
#ifdef DEBUG
if ((++x & 0xFF) == 0) {
printf("\rIters: %d", x);
}
#endif
}
}
int main(int argc, char* argv[]) {
const int thPerBlock = 256;
int* d_result, mBpSM, h_high;
printf("BuildType: %s\n", __BUILDTYPE__);
int *d_result, mBpSM, h_high;
char** d_argv;
cudaDeviceProp prop;
curandState* rst;
cudaMalloc((void**)&d_result, sizeof(int));
cudaMalloc((void**)&d_argv, argc * sizeof(char*));
for (int i = 0; i < argc; i++) {
for (unsigned char i = 0; i < argc; i++) {
unsigned long len = strlen(argv[i]) + 1;
char* d_str;
cudaMalloc((void**)&d_str, len);
@@ -107,15 +144,29 @@ int main(int argc, char* argv[]) {
args<<<1, 1>>>(d_argv, argc, d_result);
cudaDeviceSynchronize();
cudaMemcpyFromSymbol(&h_high, d_high, sizeof(unsigned));
cudaGetDeviceProperties(&prop, 0);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGen, thPerBlock, 0);
const int totalTh = mBpSM * prop.multiProcessorCount * thPerBlock;
printf("High addrs: 2%02x+\nSMs: %d\nMaxBlocksPerSM: %d\nTotalTh: %d\nBlocksThreads: %d:%d\n", h_high, prop.multiProcessorCount, mBpSM, totalTh, totalTh / thPerBlock, thPerBlock);
cudaMalloc(&rst, totalTh * sizeof(curandState));
initRand<<<totalTh / thPerBlock, thPerBlock>>>(rst);
cudaDeviceSynchronize();
KeyGen<<<totalTh / thPerBlock, thPerBlock>>>(rst);
cudaGetDeviceProperties_v2(&prop, 0);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&mBpSM, KeyGenKernel, THREADSPB, 0);
const unsigned tTh = mBpSM * prop.multiProcessorCount * THREADSPB;
printf("High Addresses: 2%02x+\nSMs: %d\nTotalThreads: %d\nBlocks: %d (Threads: %d)\n", h_high, prop.multiProcessorCount, tTh, tTh / THREADSPB, THREADSPB);
cudaMalloc(&rst, tTh * sizeof(curandState));
unsigned* d_seeds;
cudaMalloc(&d_seeds, tTh * sizeof(unsigned));
initRand<<<THDIVTHPB, THREADSPB>>>(rst, d_seeds);
cudaDeviceSynchronize();
#ifndef DEBUG
unsigned* h_seeds = static_cast<unsigned*>(malloc(tTh * sizeof(unsigned)));
cudaMemcpy(h_seeds, d_seeds, tTh * sizeof(unsigned), cudaMemcpyDeviceToHost);
if (checkSeeds(h_seeds, tTh)) {
fprintf(stderr, "Error: DUPLICATE SEEDS FOUND!\n");
free(h_seeds);
cudaFree(d_seeds);
cudaFree(rst);
return 1;
}
free(h_seeds);
cudaFree(d_seeds);
#endif
KeyGenKernel<<<THDIVTHPB, THREADSPB>>>(rst);
cudaFree(rst);
return 0;
}

View File

@@ -1,14 +1,14 @@
#include <iostream>
#include <sstream>
#include <iomanip>
#include <vector>
#include <string>
#include <random>
#include <memory.h>
#include <thread>
#include <sodium.h>
#include "defines.h"
#include <arpa/inet.h>
#include <atomic>
#include <immintrin.h>
#include <iostream>
#include <memory.h>
#include <sodium.h>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
struct option {
unsigned proc = 0;
std::atomic<unsigned> high = 0x14;
@@ -20,21 +20,18 @@ int parameters(std::string arg) {
std::istringstream ss(arg.substr(arg.find(" ") + 1));
if (arg.find("--threads") != npos || arg.find("-t") != npos) {
ss >> conf.proc;
if (ss.fail()) return 1;
if (ss.fail())
return 1;
return 0;
}
if (arg.find("--altitude") != npos || arg.find("-a") != npos) {
unsigned tmp_high;
ss >> std::hex >> tmp_high;
if (ss.fail()) return 1;
if (ss.fail())
return 1;
conf.high = tmp_high;
return 0;
}
/*
if (arg.find("--invert") != npos || arg.find("-i") != npos) {
return 0;
}
*/
}
if (arg == "--threads" || arg == "-t" || arg == "--altitude" || arg == "-a") {
return 777;
@@ -45,22 +42,20 @@ void displayConfig() {
unsigned processor_count = std::thread::hardware_concurrency();
if (conf.proc == 0 || conf.proc > static_cast<unsigned>(processor_count)) {
conf.proc = static_cast<unsigned>(processor_count);
}
printf("Threads: %u, high addresses (2%02x+)\n", conf.proc, conf.high.load());
}
using Address = unsigned char[16];
using Key = unsigned char[32];
inline std::string getAddress(const Address& rawAddr) noexcept {
inline std::string getAddress(const Address& rawAddr) {
char ipStrBuf[46];
inet_ntop(AF_INET6, rawAddr, ipStrBuf, 46);
return std::string(ipStrBuf);
}
inline std::string KeyToString(const unsigned char* key) noexcept {
inline std::string KeyToString(const unsigned char* key) {
char result[65];
const char* hexDigits = "0123456789abcdef";
#pragma unroll
for (unsigned char i = 0; i < 32; i++) {
for (int i = 0; i < 32; i++) {
result[2 * i] = hexDigits[key[i] >> 4];
result[2 * i + 1] = hexDigits[key[i] & 0x0F];
}
@@ -71,10 +66,10 @@ typedef struct alignas(32) {
Key PublicKey;
Key PrivateKey;
} KeysBox;
void getRawAddress(int lErase, Key& InvertedPublicKey, Address& rawAddr) noexcept {
void getRawAddress(unsigned lErase, Key& InvertedPublicKey, Address& rawAddr) {
lErase++;
const int bitsToShift = lErase % 8;
const int start = lErase / 8;
const int start = static_cast<int>(lErase / 8U);
if (bitsToShift != 0) {
for (int i = start; i < start + 15; i++) {
InvertedPublicKey[i] = static_cast<unsigned char>((InvertedPublicKey[i] << bitsToShift) | (InvertedPublicKey[i + 1] >> (8 - bitsToShift)));
@@ -84,79 +79,59 @@ void getRawAddress(int lErase, Key& InvertedPublicKey, Address& rawAddr) noexcep
rawAddr[1] = static_cast<unsigned char>(lErase - 1);
memcpy(&rawAddr[2], &InvertedPublicKey[start], 14);
}
inline void invertKey(const unsigned char* __restrict key, Key& inverted) noexcept {
inline void invertKey(const unsigned char* __restrict key, Key& inverted) {
#ifdef __AVX2__
_mm256_storeu_si256(reinterpret_cast<__m256i*>(inverted), _mm256_xor_si256(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(key)), _mm256_set1_epi8(0xFF)));
#else
for (int i = 0; i < 32; ++i) {
inverted[i] = static_cast<unsigned char>(key[i] ^ 0xFF);
}
#endif
}
[[nodiscard]] inline unsigned char zeroCounter(unsigned int x) noexcept {
return x == 0 ? 32 : static_cast<unsigned char>(__builtin_clz(x));
}
[[nodiscard]] inline unsigned char getZeros(const Key& v) noexcept {
unsigned char leadZeros = 0;
for (unsigned char i = 0; i < 32; i += 4) {
unsigned word = (static_cast<unsigned>(v[i]) << 24) | (static_cast<unsigned>(v[i + 1]) << 16) | (static_cast<unsigned>(v[i + 2]) << 8) | (static_cast<unsigned>(v[i + 3]));
inline unsigned getZeros(const Key& v) {
unsigned leadZeros = 0;
for (int i = 0; i < 32; i += 8) {
unsigned long long word = (static_cast<unsigned long long>(v[i]) << 56) | (static_cast<unsigned long long>(v[i + 1]) << 48) | (static_cast<unsigned long long>(v[i + 2]) << 40) | (static_cast<unsigned long long>(v[i + 3]) << 32) | (static_cast<unsigned long long>(v[i + 4]) << 24) | (static_cast<unsigned long long>(v[i + 5]) << 16) | (static_cast<unsigned long long>(v[i + 6]) << 8) | static_cast<unsigned long long>(v[i + 7]);
if (word == 0) {
leadZeros += 32;
leadZeros += 64;
} else {
leadZeros += zeroCounter(word);
leadZeros += static_cast<unsigned>(__builtin_clzll(word));
break;
}
}
return leadZeros;
}
[[nodiscard]] inline long long xorshift64(unsigned long& state) noexcept {
state ^= state << 21;
state ^= state >> 35;
state ^= state << 4;
return static_cast<long long>(state * 2685821657736338717);
}
inline void rmbytes(unsigned char* __restrict buf, unsigned long& state) noexcept {
for (unsigned char x = 0; x < 1; x++) {
_mm256_storeu_si256((__m256i*) & buf[x * 32], _mm256_set_epi64x(xorshift64(state), xorshift64(state), xorshift64(state), xorshift64(state)));
}
}
inline void sign_keypair(unsigned char* __restrict pk, unsigned char* __restrict sk, const unsigned char* __restrict seed) noexcept {
alignas(32) unsigned char h[64];
crypto_hash_sha512(h, seed, 32);
h[31] = (h[31] & 0xF8) | (0x40 | (h[31] & 0x7F));
crypto_scalarmult_ed25519_base(pk, h);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(sk), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(seed)));
_mm256_storeu_si256(reinterpret_cast<__m256i*>(sk + 32), _mm256_loadu_si256(reinterpret_cast<const __m256i*>(pk)));
}
void miner_thread() noexcept {
alignas(32) Key inv;
alignas(32) Key seed;
KeysBox keys;
Address rawAddr;
std::random_device rd;
unsigned long state = static_cast<unsigned long>(rd());
printf("Using seed: %lu\n", state);
void miner_thread() {
alignas(32) thread_local Key inv;
thread_local KeysBox keys;
thread_local Address rawAddr;
while (true) {
rmbytes(seed, state);
//sign_keypair(keys.PublicKey, keys.PrivateKey, seed);
crypto_sign_ed25519_seed_keypair(keys.PublicKey, keys.PrivateKey, seed);
if (unsigned char ones = getZeros(keys.PublicKey); ones > conf.high.load()) {
conf.high.store(ones);
crypto_sign_ed25519_keypair(keys.PublicKey, keys.PrivateKey);
unsigned ones = getZeros(keys.PublicKey), high = conf.high.load(std::memory_order_relaxed);
while (ones > high && !conf.high.compare_exchange_strong(high, ones, std::memory_order_relaxed)) {
high = conf.high.load(std::memory_order_relaxed);
}
if (ones > high) {
invertKey(keys.PublicKey, inv);
getRawAddress(ones, inv, rawAddr);
printf("\nIPv6:\t%s\nPK:\t%s\nSK:\t%s\n", getAddress(rawAddr).c_str(), KeyToString(keys.PublicKey).c_str(), KeyToString(keys.PrivateKey).c_str());
}
}
}
void startThreads() noexcept {
void startThreads() {
std::vector<std::thread> threads;
threads.reserve(conf.proc);
for (unsigned char x = 0; x < conf.proc; x++) {
for (unsigned x = 0; x < conf.proc; x++) {
threads.emplace_back(miner_thread);
}
for (auto& thread : threads) {
thread.join();
}
}
int main(int argc, char* argv[]) noexcept {
if (argc < 2) return 0;
for (int x = 1; x < argc; x++) {
int main(int argc, char* argv[]) {
std::cout << "BuildType: " << __BUILDTYPE__ << std::endl << "AVX2: " << __AVX2__ << std::endl;
if (argc < 1)
return 0;
for (int x = 0; x < argc; x++) {
if (int res = parameters(argv[x]); res == 777) {
if (++x >= argc) {
std::cerr << "Empty value for parameter \"" << argv[x - 1] << "\"" << std::endl;