vibecoded cuda patch for https://github.com/DavidBuchanan314/plcmine. use at your own risk, it worked for me on a 2070S but tweaks might be needed for other hardware
cuda.patch
edited
1diff --git a/native/Makefile b/native/Makefile
2index 2d63753..1d4c402 100644
3--- a/native/Makefile
4+++ b/native/Makefile
5@@ -2,6 +2,9 @@ all: mine
6
7 override CFLAGS += -march=native -mtune=native -O3 -flto -Wall -Wextra -Wpedantic -Wno-deprecated-declarations -Wno-format
8
9+NVCC ?= nvcc
10+NVCCFLAGS ?= -O3 -arch=sm_75 --ptxas-options=-v -DUSE_BITMAP_FILTER
11+
12 # make mine_nogmp CFLAGS=-DBENCHMARK
13
14 mine: mine.c util.h
15@@ -9,3 +12,6 @@ mine: mine.c util.h
16
17 mine_nogmp: mine_nogmp.c util.h bigint.h
18 $(CC) mine_nogmp.c -o mine_nogmp $(CFLAGS) -lcrypto
19+
20+mine_cuda: mine_cuda.cu util.h bigint.h
21+ $(NVCC) mine_cuda.cu -o mine_cuda $(NVCCFLAGS)
22diff --git a/native/mine_cuda b/native/mine_cuda
23new file mode 100755
24index 0000000..99850c0
25Binary files /dev/null and b/native/mine_cuda differ
26diff --git a/native/mine_cuda.cu b/native/mine_cuda.cu
27new file mode 100644
28index 0000000..fc22816
29--- /dev/null
30+++ b/native/mine_cuda.cu
31@@ -0,0 +1,590 @@
32+// mine_cuda.cu - CUDA port of mine_nogmp.c
33+// Based on the OpenCL port (ocl_mine.cl / ocl_mine.py)
34+//
35+// USAGE: ./mine_cuda <ignored_threads> precomputed.bin did:key:<pubkey> <prefix> [prefix2 ...]
36+// (thread count arg is accepted for CLI compatibility but ignored - GPU config is compile-time)
37+
38+#include <stdint.h>
39+#include <stdio.h>
40+#include <stdlib.h>
41+#include <string.h>
42+#include <assert.h>
43+#include <time.h>
44+#include <locale.h>
45+#include <cuda_runtime.h>
46+
47+// ---- tuning knobs ----
48+#ifndef STEPS_PER_TASK
49+#define STEPS_PER_TASK 512
50+#endif
51+#ifndef MAX_RESULTS
52+#define MAX_RESULTS 64
53+#endif
54+#ifndef WORK_SIZE
55+#define WORK_SIZE (1 << 20) // handles per kernel call
56+#endif
57+#ifndef BLOCK_SIZE
58+#define BLOCK_SIZE 256
59+#endif
60+
61+// ---- fixed offsets (matching mine_nogmp.c and ocl_mine.py) ----
62+#define PRESIGNED_LEN 155
63+#define PRESIGNED_HANDLE_OFF 55
64+#define PRESIGNED_PUBKEY_OFF 77
65+
66+#define SIGNED_LEN 247
67+#define SIGNED_SIG_OFF 7
68+#define SIGNED_HANDLE_OFF 147
69+#define SIGNED_PUBKEY_OFF 169
70+
71+#define RESULT_STRIDE 68 // handle(6)+pad(2)+row(4)+k_inv(32)+did_b32(24)
72+
73+#define MAX_PREFIXES 16
74+#define MAX_PREFIX_DATA 128
75+#define MASK26 0x03FFFFFFu
76+
77+// ---- constant memory ----
78+__constant__ uint8_t d_presigned_tpl[PRESIGNED_LEN];
79+__constant__ uint8_t d_signed_tpl[SIGNED_LEN];
80+__constant__ uint32_t d_firstbyte_bitmap[8]; // 256-bit bitmap for first-byte filter
81+__constant__ uint8_t d_firstbytes[MAX_PREFIXES];
82+__constant__ uint8_t d_prefix_data[MAX_PREFIX_DATA];
83+__constant__ uint32_t d_prefix_lens[MAX_PREFIXES];
84+__constant__ uint32_t d_prefix_offsets[MAX_PREFIXES];
85+__constant__ uint32_t d_num_prefixes;
86+
87+// ---- SHA-256 ----
88+__constant__ uint32_t K_SHA[64] = {
89+ 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
90+ 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
91+ 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
92+ 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
93+ 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
94+ 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
95+ 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
96+ 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
97+};
98+
99+static __device__ __forceinline__ uint32_t rotr32(uint32_t x, int n) {
100+ return (x >> n) | (x << (32 - n));
101+}
102+#define CH(x,y,z) ((z)^((x)&((y)^(z))))
103+#define MAJ(x,y,z) (((x)&(y))|((z)&((x)|(y))))
104+#define EP0(x) (rotr32(x,2)^rotr32(x,13)^rotr32(x,22))
105+#define EP1(x) (rotr32(x,6)^rotr32(x,11)^rotr32(x,25))
106+#define SIG0(x) (rotr32(x,7)^rotr32(x,18)^((x)>>3))
107+#define SIG1(x) (rotr32(x,17)^rotr32(x,19)^((x)>>10))
108+
109+static __device__ void sha256_compress(uint32_t state[8], uint32_t blk[16]) {
110+ uint32_t a=state[0],b=state[1],c=state[2],d=state[3];
111+ uint32_t e=state[4],f=state[5],g=state[6],h=state[7];
112+ #pragma unroll
113+ for (int i=0;i<16;i++){
114+ uint32_t t1=h+EP1(e)+CH(e,f,g)+K_SHA[i]+blk[i];
115+ uint32_t t2=EP0(a)+MAJ(a,b,c);
116+ h=g;g=f;f=e;e=d+t1;d=c;c=b;b=a;a=t1+t2;
117+ }
118+ #pragma unroll
119+ for (int i=16;i<64;i++){
120+ blk[i&15]=SIG1(blk[(i-2)&15])+blk[(i-7)&15]+SIG0(blk[(i-15)&15])+blk[(i-16)&15];
121+ uint32_t t1=h+EP1(e)+CH(e,f,g)+K_SHA[i]+blk[i&15];
122+ uint32_t t2=EP0(a)+MAJ(a,b,c);
123+ h=g;g=f;f=e;e=d+t1;d=c;c=b;b=a;a=t1+t2;
124+ }
125+ state[0]+=a;state[1]+=b;state[2]+=c;state[3]+=d;
126+ state[4]+=e;state[5]+=f;state[6]+=g;state[7]+=h;
127+}
128+
129+static __device__ void sha256_buf(const uint8_t *data, uint32_t len, uint32_t out[8]) {
130+ const uint32_t INIT[8] = {
131+ 0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,
132+ 0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
133+ };
134+ uint32_t state[8];
135+ #pragma unroll
136+ for (int i=0;i<8;i++) state[i]=INIT[i];
137+
138+ uint32_t blk[16];
139+ uint32_t pos=0;
140+
141+ while (pos+64<=len) {
142+ #pragma unroll
143+ for (int w=0;w<16;w++){
144+ uint32_t b=pos+w*4;
145+ blk[w]=((uint32_t)data[b]<<24)|((uint32_t)data[b+1]<<16)
146+ |((uint32_t)data[b+2]<<8 )|((uint32_t)data[b+3]);
147+ }
148+ sha256_compress(state,blk);
149+ pos+=64;
150+ }
151+
152+ uint32_t rem=len-pos;
153+ #pragma unroll
154+ for (int w=0;w<16;w++) blk[w]=0;
155+ for (uint32_t b=0;b<rem;b++){
156+ uint32_t w=b>>2, sh=24-((b&3)<<3);
157+ blk[w]|=((uint32_t)data[pos+b])<<sh;
158+ }
159+ {uint32_t b=rem,w=b>>2,sh=24-((b&3)<<3);blk[w]|=(uint32_t)0x80u<<sh;}
160+
161+ if (rem<56){
162+ blk[15]=len*8;
163+ sha256_compress(state,blk);
164+ } else {
165+ sha256_compress(state,blk);
166+ #pragma unroll
167+ for (int w=0;w<16;w++) blk[w]=0;
168+ blk[15]=len*8;
169+ sha256_compress(state,blk);
170+ }
171+ #pragma unroll
172+ for (int i=0;i<8;i++) out[i]=state[i];
173+}
174+
175+// ---- 256-bit modular FMA (10x26-bit limbs, same as bigint.h) ----
176+__constant__ uint32_t N_LIMBS[10] = {3555649,9937716,33799165,60472610,45788892,67108863,67108863,67108863,67108863,4194303};
177+__constant__ uint32_t C_LIMBS[5] = {63553215,57171147,33309698,6636253,21319971};
178+
179+static __device__ void mod_fma(uint32_t res[10], const uint32_t a[10], const uint32_t b[10], const uint32_t c[10]) {
180+ uint64_t t[20]={0};
181+ uint32_t hi[10]={0};
182+
183+ #pragma unroll
184+ for (int i=0;i<10;i++)
185+ #pragma unroll
186+ for (int j=0;j<10;j++)
187+ t[i+j]+=(uint64_t)a[i]*(uint64_t)b[j];
188+
189+ t[10]+=t[9]>>26; t[9]&=MASK26;
190+ #pragma unroll
191+ for (int i=10;i<19;i++){t[i+1]+=t[i]>>26;hi[i-10]=(uint32_t)((t[i]&MASK26)<<4);t[i]=0;}
192+ hi[9]=(uint32_t)(t[19]<<4);
193+ for (int i=0;i<10;i++)
194+ #pragma unroll
195+ for (int j=0;j<5;j++)
196+ t[i+j]+=(uint64_t)hi[i]*(uint64_t)C_LIMBS[j];
197+
198+ t[10]+=t[9]>>26; t[9]&=MASK26;
199+ #pragma unroll
200+ for (int i=10;i<14;i++){t[i+1]+=t[i]>>26;hi[i-10]=(uint32_t)((t[i]&MASK26)<<4);t[i]=0;}
201+ hi[4]=(uint32_t)(t[14]<<4);
202+ for (int i=0;i<5;i++)
203+ #pragma unroll
204+ for (int j=0;j<5;j++)
205+ t[i+j]+=(uint64_t)hi[i]*(uint64_t)C_LIMBS[j];
206+
207+ #pragma unroll
208+ for (int i=0;i<10;i++) t[i]+=c[i];
209+ #pragma unroll
210+ for (int i=0;i<10;i++){t[i+1]+=t[i]>>26;t[i]&=MASK26;}
211+
212+ uint32_t ov=(uint32_t)((t[9]>>22)+(t[10]<<4));
213+ #pragma unroll
214+ for (int j=0;j<5;j++) t[j]+=(uint64_t)ov*(uint64_t)C_LIMBS[j];
215+ #pragma unroll
216+ for (int i=0;i<10;i++){t[i+1]+=t[i]>>26;t[i]&=MASK26;}
217+
218+ #pragma unroll
219+ for (int i=0;i<10;i++) res[i]=(uint32_t)t[i];
220+
221+ if (res[9]&(1u<<21)){
222+ #pragma unroll
223+ for (int i=0;i<10;i++) res[i]=N_LIMBS[i]-res[i];
224+ #pragma unroll
225+ for (int i=0;i<9;i++){res[i+1]-=res[i]>>31;res[i]&=MASK26;}
226+ }
227+}
228+
229+static __host__ __device__ void bigint_unpack(uint32_t res[10], const uint8_t buf[32]) {
230+ res[0]=((uint32_t)buf[31])|((uint32_t)buf[30]<<8)|((uint32_t)buf[29]<<16)|((uint32_t)(buf[28]&0x03)<<24);res[0]&=MASK26;
231+ res[1]=((uint32_t)(buf[28]>>2))|((uint32_t)buf[27]<<6)|((uint32_t)buf[26]<<14)|((uint32_t)(buf[25]&0x0F)<<22);res[1]&=MASK26;
232+ res[2]=((uint32_t)(buf[25]>>4))|((uint32_t)buf[24]<<4)|((uint32_t)buf[23]<<12)|((uint32_t)(buf[22]&0x3F)<<20);res[2]&=MASK26;
233+ res[3]=((uint32_t)(buf[22]>>6))|((uint32_t)buf[21]<<2)|((uint32_t)buf[20]<<10)|((uint32_t)buf[19]<<18);res[3]&=MASK26;
234+ res[4]=((uint32_t)buf[18])|((uint32_t)buf[17]<<8)|((uint32_t)buf[16]<<16)|((uint32_t)(buf[15]&0x03)<<24);res[4]&=MASK26;
235+ res[5]=((uint32_t)(buf[15]>>2))|((uint32_t)buf[14]<<6)|((uint32_t)buf[13]<<14)|((uint32_t)(buf[12]&0x0F)<<22);res[5]&=MASK26;
236+ res[6]=((uint32_t)(buf[12]>>4))|((uint32_t)buf[11]<<4)|((uint32_t)buf[10]<<12)|((uint32_t)(buf[9]&0x3F)<<20);res[6]&=MASK26;
237+ res[7]=((uint32_t)(buf[9]>>6))|((uint32_t)buf[8]<<2)|((uint32_t)buf[7]<<10)|((uint32_t)buf[6]<<18);res[7]&=MASK26;
238+ res[8]=((uint32_t)buf[5])|((uint32_t)buf[4]<<8)|((uint32_t)buf[3]<<16)|((uint32_t)(buf[2]&0x03)<<24);res[8]&=MASK26;
239+ res[9]=((uint32_t)(buf[2]>>2))|((uint32_t)buf[1]<<6)|((uint32_t)buf[0]<<14);res[9]&=0x003FFFFFu;
240+}
241+
242+static __device__ void bigint_pack(uint8_t buf[32], const uint32_t b[10]) {
243+ buf[31]=(b[0]>>0)&0xFF;buf[30]=(b[0]>>8)&0xFF;buf[29]=(b[0]>>16)&0xFF;buf[28]=(b[0]>>24)&0x03;
244+ buf[28]|=(b[1]&0x3F)<<2;buf[27]=(b[1]>>6)&0xFF;buf[26]=(b[1]>>14)&0xFF;buf[25]=(b[1]>>22)&0x0F;
245+ buf[25]|=(b[2]&0x0F)<<4;buf[24]=(b[2]>>4)&0xFF;buf[23]=(b[2]>>12)&0xFF;buf[22]=(b[2]>>20)&0x3F;
246+ buf[22]|=(b[3]&0x03)<<6;buf[21]=(b[3]>>2)&0xFF;buf[20]=(b[3]>>10)&0xFF;buf[19]=(b[3]>>18)&0xFF;
247+ buf[18]=(b[4]>>0)&0xFF;buf[17]=(b[4]>>8)&0xFF;buf[16]=(b[4]>>16)&0xFF;buf[15]=(b[4]>>24)&0x03;
248+ buf[15]|=(b[5]&0x3F)<<2;buf[14]=(b[5]>>6)&0xFF;buf[13]=(b[5]>>14)&0xFF;buf[12]=(b[5]>>22)&0x0F;
249+ buf[12]|=(b[6]&0x0F)<<4;buf[11]=(b[6]>>4)&0xFF;buf[10]=(b[6]>>12)&0xFF;buf[9]=(b[6]>>20)&0x3F;
250+ buf[9]|=(b[7]&0x03)<<6;buf[8]=(b[7]>>2)&0xFF;buf[7]=(b[7]>>10)&0xFF;buf[6]=(b[7]>>18)&0xFF;
251+ buf[5]=(b[8]>>0)&0xFF;buf[4]=(b[8]>>8)&0xFF;buf[3]=(b[8]>>16)&0xFF;buf[2]=(b[8]>>24)&0x03;
252+ buf[2]|=(b[9]&0x3F)<<2;buf[1]=(b[9]>>6)&0xFF;buf[0]=(b[9]>>14)&0xFF;
253+}
254+
255+// ---- base64url no-pad ----
256+__constant__ uint8_t B64[64] = {'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_'};
257+
258+// Encode r_bytes[30..31] || s_bytes[0..31] (34 bytes) -> 46 chars
259+static __device__ void b64_raw_sig(uint8_t *out, uint8_t r0, uint8_t r1, const uint8_t *s) {
260+ out[0]=B64[(r0>>2)&0x3f];
261+ out[1]=B64[((r0<<4)|(r1>>4))&0x3f];
262+ out[2]=B64[((r1<<2)|(s[0]>>6))&0x3f];
263+ out[3]=B64[s[0]&0x3f];
264+ #pragma unroll
265+ for (int i=0;i<10;i++){
266+ uint8_t a=s[1+i*3],b=s[2+i*3],c=s[3+i*3];
267+ out[4+i*4]=B64[(a>>2)&0x3f];
268+ out[5+i*4]=B64[((a<<4)|(b>>4))&0x3f];
269+ out[6+i*4]=B64[((b<<2)|(c>>6))&0x3f];
270+ out[7+i*4]=B64[c&0x3f];
271+ }
272+ out[44]=B64[(s[31]>>2)&0x3f];
273+ out[45]=B64[(s[31]<<4)&0x3f];
274+}
275+
276+// ---- base32 multibase ----
277+__constant__ uint8_t B32[32] = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','2','3','4','5','6','7'};
278+
279+static __device__ void b32_encode(uint8_t *out, const uint8_t *data, uint32_t len) {
280+ uint32_t i=0;
281+ while (i+4<len){
282+ uint8_t a=data[i],b=data[i+1],c=data[i+2],d=data[i+3],e=data[i+4];i+=5;
283+ *out++=B32[(a>>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f];
284+ *out++=B32[(b>>1)&0x1f];*out++=B32[((b<<4)|(c>>4))&0x1f];
285+ *out++=B32[((c<<1)|(d>>7))&0x1f];*out++=B32[(d>>2)&0x1f];
286+ *out++=B32[((d<<3)|(e>>5))&0x1f];*out++=B32[e&0x1f];
287+ }
288+ uint32_t rem=len-i;
289+ if(rem==4){uint8_t a=data[i],b=data[i+1],c=data[i+2],d=data[i+3];
290+ *out++=B32[(a>>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f];
291+ *out++=B32[(b>>1)&0x1f];*out++=B32[((b<<4)|(c>>4))&0x1f];
292+ *out++=B32[((c<<1)|(d>>7))&0x1f];*out++=B32[(d>>2)&0x1f];*out++=B32[(d<<3)&0x1f];}
293+ else if(rem==3){uint8_t a=data[i],b=data[i+1],c=data[i+2];
294+ *out++=B32[(a>>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f];
295+ *out++=B32[(b>>1)&0x1f];*out++=B32[((b<<4)|(c>>4))&0x1f];*out++=B32[(c<<1)&0x1f];}
296+ else if(rem==2){uint8_t a=data[i],b=data[i+1];
297+ *out++=B32[(a>>3)&0x1f];*out++=B32[((a<<2)|(b>>6))&0x1f];
298+ *out++=B32[(b>>1)&0x1f];*out++=B32[(b<<4)&0x1f];}
299+ else if(rem==1){uint8_t a=data[i];
300+ *out++=B32[(a>>3)&0x1f];*out++=B32[(a<<2)&0x1f];}
301+}
302+
303+// ---- main kernel ----
304+__global__ __launch_bounds__(BLOCK_SIZE)
305+void mine_plc(
306+ const uint32_t * __restrict__ limb_table,
307+ const uint8_t * __restrict__ r_b64_tbl,
308+ const uint8_t * __restrict__ r_tail,
309+ uint8_t * __restrict__ results,
310+ uint32_t * __restrict__ result_count,
311+ uint32_t handle_base,
312+ uint32_t row_base,
313+ uint32_t num_rows
314+) {
315+ uint32_t gid = blockIdx.x * blockDim.x + threadIdx.x;
316+ uint32_t handle_idx = handle_base + gid;
317+
318+ uint8_t handle[6];
319+ {uint32_t idx=handle_idx;for(int j=0;j<6;j++){handle[5-j]=B64[idx&0x3f];idx>>=6;}}
320+
321+ uint8_t presigned[PRESIGNED_LEN];
322+ #pragma unroll
323+ for (int i=0;i<PRESIGNED_LEN;i++) presigned[i]=d_presigned_tpl[i];
324+ for (int j=0;j<6;j++) presigned[PRESIGNED_HANDLE_OFF+j]=handle[j];
325+
326+ uint32_t z_state[8];
327+ sha256_buf(presigned, PRESIGNED_LEN, z_state);
328+
329+ uint8_t z_bytes[32];
330+ #pragma unroll
331+ for (int w=0;w<8;w++){
332+ z_bytes[w*4+0]=(z_state[w]>>24)&0xFF;z_bytes[w*4+1]=(z_state[w]>>16)&0xFF;
333+ z_bytes[w*4+2]=(z_state[w]>> 8)&0xFF;z_bytes[w*4+3]=(z_state[w] )&0xFF;
334+ }
335+ uint32_t z[10];
336+ bigint_unpack(z, z_bytes);
337+
338+ uint8_t signed_op[SIGNED_LEN];
339+ #pragma unroll
340+ for (int i=0;i<SIGNED_LEN;i++) signed_op[i]=d_signed_tpl[i];
341+ for (int j=0;j<6;j++) signed_op[SIGNED_HANDLE_OFF+j]=handle[j];
342+
343+ uint32_t row_end=row_base+STEPS_PER_TASK;
344+ if (row_end>num_rows) row_end=num_rows;
345+
346+ for (uint32_t row=row_base; row<row_end; row++) {
347+ uint32_t loff=row*20;
348+ uint32_t k_inv_rDa[10], k_inv_l[10];
349+ #pragma unroll
350+ for (int i=0;i<10;i++) k_inv_rDa[i]=limb_table[loff+i];
351+ #pragma unroll
352+ for (int i=0;i<10;i++) k_inv_l[i]=limb_table[loff+10+i];
353+
354+ uint32_t s[10];
355+ mod_fma(s, z, k_inv_l, k_inv_rDa);
356+
357+ uint8_t s_bytes[32];
358+ bigint_pack(s_bytes, s);
359+
360+ uint32_t rb64=row*40;
361+ #pragma unroll
362+ for (int b=0;b<40;b++) signed_op[SIGNED_SIG_OFF+b]=r_b64_tbl[rb64+b];
363+ b64_raw_sig(&signed_op[SIGNED_SIG_OFF+40], r_tail[row*2], r_tail[row*2+1], s_bytes);
364+
365+ uint32_t did_state[8];
366+ sha256_buf(signed_op, SIGNED_LEN, did_state);
367+
368+ uint8_t b0=(did_state[0]>>24)&0xFF;
369+#ifdef USE_BITMAP_FILTER
370+ if (!(d_firstbyte_bitmap[b0>>5]&(1u<<(b0&31)))) continue;
371+#else
372+ {int hit=0;for(uint32_t p=0;p<d_num_prefixes;p++)if(b0==d_firstbytes[p]){hit=1;break;}if(!hit)continue;}
373+#endif
374+
375+ uint8_t did_hash[15];
376+ #pragma unroll
377+ for (int b=0;b<15;b++){int w=b/4,sh=24-(b&3)*8;did_hash[b]=(did_state[w]>>sh)&0xFF;}
378+ uint8_t did_b32[24];
379+ b32_encode(did_b32, did_hash, 15);
380+
381+ for (uint32_t p=0;p<d_num_prefixes;p++){
382+ if (b0!=d_firstbytes[p]) continue;
383+ uint32_t plen=d_prefix_lens[p], poff=d_prefix_offsets[p];
384+ int ok=1;
385+ for (uint32_t ci=0;ci<plen;ci++) if(did_b32[ci]!=d_prefix_data[poff+ci]){ok=0;break;}
386+ if (!ok) continue;
387+
388+ uint32_t slot=atomicAdd(result_count, 1u);
389+ if (slot<MAX_RESULTS){
390+ uint32_t roff=slot*RESULT_STRIDE;
391+ for (int b=0;b<6;b++) results[roff+b]=handle[b];
392+ results[roff+6]=0;results[roff+7]=0;
393+ results[roff+8]=(row>>24)&0xFF;results[roff+9]=(row>>16)&0xFF;
394+ results[roff+10]=(row>>8)&0xFF;results[roff+11]=row&0xFF;
395+ uint8_t k_inv_bytes[32];
396+ bigint_pack(k_inv_bytes, k_inv_l);
397+ for (int b=0;b<32;b++) results[roff+12+b]=k_inv_bytes[b];
398+ for (int b=0;b<24;b++) results[roff+44+b]=did_b32[b];
399+ }
400+ }
401+ }
402+}
403+
404+// ---- host utilities ----
405+
406+static void cuda_check(cudaError_t err, const char *where) {
407+ if (err!=cudaSuccess){
408+ fprintf(stderr,"CUDA error at %s: %s\n",where,cudaGetErrorString(err));
409+ exit(1);
410+ }
411+}
412+#define CUDA_CHECK(x) cuda_check((x), #x)
413+
414+static double timestamp(void) {
415+ struct timespec ts;
416+ clock_gettime(CLOCK_REALTIME,&ts);
417+ return ts.tv_sec+ts.tv_nsec*1e-9;
418+}
419+
420+// load precomputed.bin -> device buffers
421+// each row = [r(32), k_inv_rDa(32), k_inv(32)] = 96 bytes
422+static void load_precomputed(const char *path,
423+ uint32_t **d_limb_table, uint8_t **d_r_b64, uint8_t **d_r_tail,
424+ size_t *num_rows_out)
425+{
426+ static const char b64c[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
427+
428+ FILE *f=fopen(path,"rb");
429+ if (!f){fprintf(stderr,"cannot open %s\n",path);exit(1);}
430+ fseek(f,0,SEEK_END);
431+ long sz=ftell(f);rewind(f);
432+ assert(sz%96==0);
433+ size_t num_rows=sz/96;
434+ uint8_t *raw=(uint8_t*)malloc(sz);
435+ assert(fread(raw,sz,1,f)==1);
436+ fclose(f);
437+
438+ uint32_t *limb_table=(uint32_t*)malloc(num_rows*20*sizeof(uint32_t));
439+ uint8_t *r_b64 =(uint8_t *)malloc(num_rows*40);
440+ uint8_t *r_tail =(uint8_t *)malloc(num_rows*2);
441+
442+ for (size_t i=0;i<num_rows;i++){
443+ uint8_t *row=raw+i*96;
444+ bigint_unpack(limb_table+i*20, row+32); // k_inv_rDa
445+ bigint_unpack(limb_table+i*20+10, row+64); // k_inv
446+
447+ // base64url-encode r[0..29] (exactly 30 bytes = 10 groups of 3 -> 40 chars, no remainder)
448+ for (int g=0;g<10;g++){
449+ uint8_t a=row[g*3],b=row[g*3+1],c=row[g*3+2];
450+ r_b64[i*40+g*4+0]=b64c[(a>>2)&0x3f];
451+ r_b64[i*40+g*4+1]=b64c[((a<<4)|(b>>4))&0x3f];
452+ r_b64[i*40+g*4+2]=b64c[((b<<2)|(c>>6))&0x3f];
453+ r_b64[i*40+g*4+3]=b64c[c&0x3f];
454+ }
455+ r_tail[i*2 ]=row[30];
456+ r_tail[i*2+1]=row[31];
457+ }
458+ free(raw);
459+
460+ CUDA_CHECK(cudaMalloc(d_limb_table, num_rows*20*sizeof(uint32_t)));
461+ CUDA_CHECK(cudaMalloc(d_r_b64, num_rows*40));
462+ CUDA_CHECK(cudaMalloc(d_r_tail, num_rows*2));
463+ CUDA_CHECK(cudaMemcpy(*d_limb_table, limb_table, num_rows*20*sizeof(uint32_t), cudaMemcpyHostToDevice));
464+ CUDA_CHECK(cudaMemcpy(*d_r_b64, r_b64, num_rows*40, cudaMemcpyHostToDevice));
465+ CUDA_CHECK(cudaMemcpy(*d_r_tail, r_tail, num_rows*2, cudaMemcpyHostToDevice));
466+ free(limb_table);free(r_b64);free(r_tail);
467+ *num_rows_out=num_rows;
468+}
469+
470+static uint8_t prefix_firstbyte(const char *p) {
471+ static const char *b32c="abcdefghijklmnopqrstuvwxyz234567";
472+ int c0=(int)(strchr(b32c,p[0])-b32c);
473+ int c1=(int)(strchr(b32c,p[1])-b32c);
474+ return (uint8_t)((c0<<3)|(c1>>2));
475+}
476+
477+int main(int argc, char *argv[]) {
478+ if (argc<5){
479+ fprintf(stderr,"USAGE: %s <threads_ignored> precomputed.bin did:key:<pubkey> <prefix> [prefix2 ...]\n",argv[0]);
480+ return 1;
481+ }
482+ const char *precomputed_path=argv[2];
483+ const char *pubkey=argv[3];
484+ int num_prefixes=argc-4;
485+ char **prefixes=argv+4;
486+
487+ if (strlen(pubkey)!=57){fprintf(stderr,"invalid pubkey length\n");return 1;}
488+ for (int i=0;i<num_prefixes;i++){
489+ if (strlen(prefixes[i])<2){fprintf(stderr,"prefix too short\n");return 1;}
490+ if (strlen(prefixes[i])>8){fprintf(stderr,"prefix too long (>8)\n");return 1;}
491+ }
492+ if (num_prefixes>MAX_PREFIXES){fprintf(stderr,"too many prefixes (max %d)\n",MAX_PREFIXES);return 1;}
493+
494+ // ---- build templates ----
495+ uint8_t presigned_tpl[PRESIGNED_LEN];
496+ {
497+ char buf[256];
498+ int n=snprintf(buf,sizeof(buf),
499+ "\xa6""dprev\xf6""dtypemplc_operationhservices\xa0""kalsoKnownAs\x81""kat://AAAAAAlrotationKeys\x81""x9%ssverificationMethods\xa0",
500+ pubkey);
501+ assert(n==PRESIGNED_LEN);
502+ memcpy(presigned_tpl,buf,PRESIGNED_LEN);
503+ }
504+
505+ uint8_t signed_tpl[SIGNED_LEN];
506+ {
507+ const uint8_t header[] = "\xa7""csigxV"; // 7 bytes
508+ const uint8_t mid[] = "dprev\xf6""dtypemplc_operationhservices\xa0""kalsoKnownAs\x81""kat://AAAAAAlrotationKeys\x81""x9";
509+ const uint8_t tail[] = "sverificationMethods\xa0"; // 21 bytes
510+ uint8_t *p=signed_tpl;
511+ memcpy(p,header,7); p+=7;
512+ memset(p,'A',86); p+=86; // sig placeholder
513+ memcpy(p,mid,sizeof(mid)-1); p+=sizeof(mid)-1;
514+ assert((p-signed_tpl)==SIGNED_PUBKEY_OFF);
515+ memcpy(p,pubkey,57); p+=57;
516+ memcpy(p,tail,21); p+=21;
517+ assert((p-signed_tpl)==SIGNED_LEN);
518+ }
519+
520+ // ---- upload templates to constant memory ----
521+ CUDA_CHECK(cudaMemcpyToSymbol(d_presigned_tpl, presigned_tpl, PRESIGNED_LEN));
522+ CUDA_CHECK(cudaMemcpyToSymbol(d_signed_tpl, signed_tpl, SIGNED_LEN));
523+
524+ // ---- upload prefix data ----
525+ {
526+ uint8_t fb[MAX_PREFIXES]={0};
527+ uint8_t pdata[MAX_PREFIX_DATA]={0};
528+ uint32_t plens[MAX_PREFIXES]={0};
529+ uint32_t poffs[MAX_PREFIXES]={0};
530+ uint32_t off=0;
531+ for (int i=0;i<num_prefixes;i++){
532+ fb[i]=prefix_firstbyte(prefixes[i]);
533+ plens[i]=(uint32_t)strlen(prefixes[i]);
534+ poffs[i]=off;
535+ memcpy(pdata+off,prefixes[i],plens[i]);
536+ off+=plens[i];
537+ }
538+ uint32_t bitmap[8]={0};
539+ for (int i=0;i<num_prefixes;i++) bitmap[fb[i]>>5]|=(1u<<(fb[i]&31));
540+ uint32_t np=(uint32_t)num_prefixes;
541+ CUDA_CHECK(cudaMemcpyToSymbol(d_firstbyte_bitmap, bitmap, sizeof(bitmap)));
542+ CUDA_CHECK(cudaMemcpyToSymbol(d_firstbytes, fb, sizeof(uint8_t)*num_prefixes));
543+ CUDA_CHECK(cudaMemcpyToSymbol(d_prefix_data, pdata, off));
544+ CUDA_CHECK(cudaMemcpyToSymbol(d_prefix_lens, plens, sizeof(uint32_t)*num_prefixes));
545+ CUDA_CHECK(cudaMemcpyToSymbol(d_prefix_offsets,poffs, sizeof(uint32_t)*num_prefixes));
546+ CUDA_CHECK(cudaMemcpyToSymbol(d_num_prefixes, &np, sizeof(uint32_t)));
547+ }
548+
549+ // ---- load precomputed table ----
550+ uint32_t *d_limb_table; uint8_t *d_r_b64, *d_r_tail;
551+ size_t num_rows;
552+ load_precomputed(precomputed_path, &d_limb_table, &d_r_b64, &d_r_tail, &num_rows);
553+ fprintf(stderr,"imported %zu rows, WORK_SIZE=%d STEPS_PER_TASK=%d\n",num_rows,WORK_SIZE,STEPS_PER_TASK);
554+
555+ // ---- allocate result buffers ----
556+ uint8_t *d_results;
557+ uint32_t *d_result_count;
558+ uint8_t h_results[MAX_RESULTS*RESULT_STRIDE];
559+ uint32_t h_result_count;
560+ CUDA_CHECK(cudaMalloc(&d_results, MAX_RESULTS*RESULT_STRIDE));
561+ CUDA_CHECK(cudaMalloc(&d_result_count, sizeof(uint32_t)));
562+
563+ // ---- mining loop ----
564+ CUDA_CHECK(cudaFuncSetAttribute(mine_plc, cudaFuncAttributePreferredSharedMemoryCarveout, 0));
565+
566+ setlocale(LC_NUMERIC,"");
567+ uint64_t total_plcs=0, total_found=0;
568+ double start=timestamp();
569+ uint64_t handle_base=0;
570+ int blocks=WORK_SIZE/BLOCK_SIZE;
571+
572+ while (1) {
573+ for (uint32_t row_base=0; row_base<(uint32_t)num_rows; row_base+=STEPS_PER_TASK) {
574+
575+ h_result_count=0;
576+ CUDA_CHECK(cudaMemcpy(d_result_count,&h_result_count,sizeof(uint32_t),cudaMemcpyHostToDevice));
577+
578+ mine_plc<<<blocks,BLOCK_SIZE>>>(
579+ d_limb_table, d_r_b64, d_r_tail,
580+ d_results, d_result_count,
581+ (uint32_t)handle_base, row_base, (uint32_t)num_rows
582+ );
583+ CUDA_CHECK(cudaGetLastError());
584+ CUDA_CHECK(cudaDeviceSynchronize());
585+
586+ CUDA_CHECK(cudaMemcpy(&h_result_count,d_result_count,sizeof(uint32_t),cudaMemcpyDeviceToHost));
587+
588+ uint32_t n=h_result_count<MAX_RESULTS?h_result_count:MAX_RESULTS;
589+ if (n>0){
590+ CUDA_CHECK(cudaMemcpy(h_results,d_results,n*RESULT_STRIDE,cudaMemcpyDeviceToHost));
591+ for (uint32_t i=0;i<n;i++){
592+ uint8_t *r=h_results+i*RESULT_STRIDE;
593+ char handle[7]; memcpy(handle,r,6); handle[6]=0;
594+ uint32_t row=((uint32_t)r[8]<<24)|((uint32_t)r[9]<<16)|((uint32_t)r[10]<<8)|r[11];
595+ char did_b32[25]; memcpy(did_b32,r+44,24); did_b32[24]=0;
596+ printf("%s %s 0x",did_b32,handle);
597+ for (int b=0;b<32;b++) printf("%02x",r[12+b]);
598+ printf("\n");
599+ fflush(stdout);
600+ total_found++;
601+ (void)row;
602+ }
603+ }
604+
605+ uint32_t rows_this_call=(row_base+STEPS_PER_TASK<(uint32_t)num_rows)
606+ ?STEPS_PER_TASK:(uint32_t)num_rows-row_base;
607+ total_plcs+=(uint64_t)WORK_SIZE*rows_this_call;
608+
609+ double dur=timestamp()-start;
610+ double rate=total_plcs/1e6/dur;
611+ fprintf(stderr,"\tStats: %'llu PLCs in %.3fs (%.1fM/s avg) Found: %llu\r",
612+ (unsigned long long)total_plcs, dur, rate, (unsigned long long)total_found);
613+ }
614+ handle_base+=WORK_SIZE;
615+ if (handle_base >= (1ULL<<32)) {
616+ fprintf(stderr,"\nhandle space exhausted (2^32 handles checked)\n");
617+ goto done;
618+ }
619+ }
620+ done:;
621+}