update

Changed files
+344 -444
internal
+319 -437
internal/plc/bundle.go
··· 17 17 "github.com/klauspost/compress/zstd" 18 18 ) 19 19 20 - // BUNDLE_SIZE is the number of operations per bundle 21 20 const BUNDLE_SIZE = 10000 22 21 23 22 type BundleManager struct { ··· 27 26 decoder *zstd.Decoder 28 27 db storage.Database 29 28 } 29 + 30 + // ===== INITIALIZATION ===== 30 31 31 32 func NewBundleManager(dir string, enabled bool, db storage.Database) (*BundleManager, error) { 32 33 if !enabled { ··· 49 50 50 51 return &BundleManager{ 51 52 dir: dir, 52 - enabled: true, 53 + enabled: enabled, 53 54 encoder: encoder, 54 55 decoder: decoder, 55 56 db: db, ··· 65 66 } 66 67 } 67 68 68 - // GetBundleFilename returns filename for bundle number (6-digit decimal, JSONL format) 69 - func (bm *BundleManager) GetBundleFilename(bundleNumber int) string { 70 - return fmt.Sprintf("%06d.jsonl.zst", bundleNumber) 69 + // ===== BUNDLE FILE ABSTRACTION ===== 70 + 71 + type bundleFile struct { 72 + path string 73 + operations []PLCOperation 74 + uncompressedHash string 75 + compressedHash string 71 76 } 72 77 73 - // GetBundlePath returns full path for bundle number 74 - func (bm *BundleManager) GetBundlePath(bundleNumber int) string { 75 - return filepath.Join(bm.dir, bm.GetBundleFilename(bundleNumber)) 78 + func (bm *BundleManager) newBundleFile(bundleNum int) *bundleFile { 79 + return &bundleFile{ 80 + path: filepath.Join(bm.dir, fmt.Sprintf("%06d.jsonl.zst", bundleNum)), 81 + } 76 82 } 77 83 78 - // BundleExists checks if bundle file exists locally 79 - func (bm *BundleManager) BundleExists(bundleNumber int) bool { 80 - _, err := os.Stat(bm.GetBundlePath(bundleNumber)) 84 + func (bf *bundleFile) exists() bool { 85 + _, err := os.Stat(bf.path) 81 86 return err == nil 82 87 } 83 88 84 - // LoadBundle returns exactly 1000 unique operations by fetching additional batches if needed 85 - func (bm *BundleManager) LoadBundle(ctx context.Context, bundleNumber int, plcClient *Client) ([]PLCOperation, bool, error) { 86 - if !bm.enabled { 87 - return nil, false, fmt.Errorf("bundle manager disabled") 89 + func (bm *BundleManager) load(bf *bundleFile) error { 90 + compressed, err := os.ReadFile(bf.path) 91 + if err != nil { 92 + return fmt.Errorf("read failed: %w", err) 88 93 } 89 94 90 - path := bm.GetBundlePath(bundleNumber) 95 + decompressed, err := bm.decoder.DecodeAll(compressed, nil) 96 + if err != nil { 97 + return fmt.Errorf("decompress failed: %w", err) 98 + } 91 99 92 - // Try to load from local file first 93 - if bm.BundleExists(bundleNumber) { 94 - log.Verbose("→ Loading bundle %06d from local file", bundleNumber) 100 + bf.operations = bm.parseJSONL(decompressed) 101 + return nil 102 + } 95 103 96 - // Check if bundle exists in database 97 - dbBundle, dbErr := bm.db.GetBundleByNumber(ctx, bundleNumber) 98 - bundleInDB := dbErr == nil && dbBundle != nil 104 + func (bm *BundleManager) save(bf *bundleFile) error { 105 + jsonlData := bm.serializeJSONL(bf.operations) 106 + bf.uncompressedHash = bm.hash(jsonlData) 99 107 100 - if bundleInDB { 101 - // Verify compressed file hash 102 - if dbBundle.CompressedHash != "" { 103 - valid, err := bm.verifyBundleHash(path, dbBundle.CompressedHash) 104 - if err != nil { 105 - log.Error("Warning: failed to verify compressed hash for bundle %06d: %v", bundleNumber, err) 106 - } else if !valid { 107 - log.Error("⚠ Compressed hash mismatch for bundle %06d! Re-fetching...", bundleNumber) 108 - os.Remove(path) 109 - return bm.LoadBundle(ctx, bundleNumber, plcClient) 110 - } else { 111 - log.Verbose("✓ Hash verified for bundle %06d", bundleNumber) 112 - } 113 - } 114 - } 108 + compressed := bm.encoder.EncodeAll(jsonlData, nil) 109 + bf.compressedHash = bm.hash(compressed) 115 110 116 - // Load operations from file 117 - operations, err := bm.loadBundleFromFile(path) 118 - if err != nil { 119 - return nil, false, fmt.Errorf("failed to load bundle from file: %w", err) 120 - } 111 + return os.WriteFile(bf.path, compressed, 0644) 112 + } 121 113 122 - // If not in database, index it now 123 - if !bundleInDB { 124 - // Calculate both hashes 125 - fileData, err := os.ReadFile(path) 126 - if err != nil { 127 - log.Error("Warning: failed to read file: %v", err) 128 - } else { 129 - compressedHash := bm.calculateHash(fileData) 114 + func (bm *BundleManager) parseJSONL(data []byte) []PLCOperation { 115 + var ops []PLCOperation 116 + scanner := bufio.NewScanner(bytes.NewReader(data)) 130 117 131 - // Calculate uncompressed hash 132 - var jsonlData []byte 133 - for _, op := range operations { 134 - jsonlData = append(jsonlData, op.RawJSON...) 135 - jsonlData = append(jsonlData, '\n') 136 - } 137 - uncompressedHash := bm.calculateHash(jsonlData) 118 + for scanner.Scan() { 119 + line := scanner.Bytes() 120 + if len(line) == 0 { 121 + continue 122 + } 138 123 139 - if err := bm.indexBundleWithHash(ctx, bundleNumber, operations, path, uncompressedHash, compressedHash); err != nil { 140 - log.Error("Warning: failed to index bundle: %v", err) 141 - } else { 142 - log.Info("✓ Indexed bundle %06d", bundleNumber) 143 - } 144 - } 124 + var op PLCOperation 125 + if err := json.Unmarshal(line, &op); err == nil { 126 + op.RawJSON = append([]byte(nil), line...) 127 + ops = append(ops, op) 145 128 } 146 - 147 - // If loaded from disk, it's always complete 148 - return operations, true, nil 149 129 } 150 130 151 - // Bundle doesn't exist locally - fetch from PLC directory 152 - log.Info("→ Bundle %06d not found locally, fetching from PLC directory...", bundleNumber) 153 - 154 - var afterTimestamp string 155 - var prevBoundaryCIDs map[string]bool 131 + return ops 132 + } 156 133 157 - if bundleNumber > 1 { 158 - prevBundle, err := bm.db.GetBundleByNumber(ctx, bundleNumber-1) 159 - if err == nil && prevBundle != nil { 160 - afterTimestamp = prevBundle.EndTime.Format(time.RFC3339Nano) 161 - 162 - // Get boundary CIDs from previous bundle 163 - if len(prevBundle.BoundaryCIDs) > 0 { 164 - prevBoundaryCIDs = make(map[string]bool) 165 - for _, cid := range prevBundle.BoundaryCIDs { 166 - prevBoundaryCIDs[cid] = true 167 - } 168 - log.Verbose(" Using %d boundary CIDs from previous bundle", len(prevBoundaryCIDs)) 169 - } else { 170 - // Fallback: load previous bundle's operations 171 - prevPath := bm.GetBundlePath(bundleNumber - 1) 172 - if bm.BundleExists(bundleNumber - 1) { 173 - prevOps, err := bm.loadBundleFromFile(prevPath) 174 - if err == nil { 175 - _, prevBoundaryCIDs = GetBoundaryCIDs(prevOps) 176 - log.Verbose(" Computed %d boundary CIDs from previous bundle file", len(prevBoundaryCIDs)) 177 - } 178 - } 179 - } 180 - } 134 + func (bm *BundleManager) serializeJSONL(ops []PLCOperation) []byte { 135 + var buf []byte 136 + for _, op := range ops { 137 + buf = append(buf, op.RawJSON...) 138 + buf = append(buf, '\n') 181 139 } 140 + return buf 141 + } 182 142 183 - // Collect operations until we have exactly BUNDLE_SIZE unique ones 184 - var allOperations []PLCOperation 185 - seenCIDs := make(map[string]bool) 143 + // ===== BUNDLE FETCHING ===== 186 144 187 - // Track what we've already seen from previous bundle 145 + type bundleFetcher struct { 146 + client *Client 147 + seenCIDs map[string]bool 148 + currentAfter string 149 + fetchCount int 150 + } 151 + 152 + func newBundleFetcher(client *Client, afterTime string, prevBoundaryCIDs map[string]bool) *bundleFetcher { 153 + seen := make(map[string]bool) 188 154 for cid := range prevBoundaryCIDs { 189 - seenCIDs[cid] = true 155 + seen[cid] = true 190 156 } 191 157 192 - currentAfter := afterTimestamp 193 - 194 - // Scale maxFetches based on bundle size 195 - // Assume worst case: 90% dedup rate, need buffer 196 - maxFetches := (BUNDLE_SIZE / 900) + 5 // For 10k: ~16 fetches, for 1k: ~6 fetches 197 - fetchCount := 0 198 - 199 - for len(allOperations) < BUNDLE_SIZE && fetchCount < maxFetches { 200 - fetchCount++ 201 - 202 - // Calculate how many more operations we need 203 - remaining := BUNDLE_SIZE - len(allOperations) 204 - 205 - // Determine fetch size based on remaining operations 206 - var fetchSize int 207 - if fetchCount == 1 { 208 - // First fetch: always get 1000 (PLC limit) 209 - fetchSize = 1000 210 - } else if remaining < 100 { 211 - // Need less than 100: fetch 50 212 - fetchSize = 50 213 - } else if remaining < 500 { 214 - // Need 100-500: fetch 200 215 - fetchSize = 200 216 - } else { 217 - // Need 500+: fetch 1000 218 - fetchSize = 1000 219 - } 158 + return &bundleFetcher{ 159 + client: client, 160 + seenCIDs: seen, 161 + currentAfter: afterTime, 162 + } 163 + } 220 164 221 - // Fetch next batch 222 - log.Verbose(" Fetch #%d: need %d more, requesting %d", fetchCount, remaining, fetchSize) 165 + func (bf *bundleFetcher) fetchUntilComplete(ctx context.Context, target int) ([]PLCOperation, bool) { 166 + var ops []PLCOperation 167 + maxFetches := (target / 900) + 5 223 168 224 - rawOperations, err := bm.fetchBundleFromPLCWithCount(ctx, plcClient, currentAfter, fetchSize) 225 - if err != nil { 226 - return nil, false, fmt.Errorf("failed to fetch bundle from PLC: %w", err) 227 - } 169 + for len(ops) < target && bf.fetchCount < maxFetches { 170 + bf.fetchCount++ 171 + batchSize := bf.calculateBatchSize(target - len(ops)) 228 172 229 - if len(rawOperations) == 0 { 230 - // No more data available 231 - log.Info(" No more operations available after %d fetches (got %d/%d)", 232 - fetchCount, len(allOperations), BUNDLE_SIZE) 233 - break 234 - } 173 + log.Verbose(" Fetch #%d: need %d more, requesting %d", bf.fetchCount, target-len(ops), batchSize) 235 174 236 - log.Verbose(" Got %d raw operations", len(rawOperations)) 175 + batch, shouldContinue := bf.fetchBatch(ctx, batchSize) 237 176 238 - // Filter out duplicates and add unique operations 239 - newOpsAdded := 0 240 - for _, op := range rawOperations { 241 - if !seenCIDs[op.CID] { 242 - seenCIDs[op.CID] = true 243 - allOperations = append(allOperations, op) 244 - newOpsAdded++ 177 + for _, op := range batch { 178 + if !bf.seenCIDs[op.CID] { 179 + bf.seenCIDs[op.CID] = true 180 + ops = append(ops, op) 245 181 246 - if len(allOperations) >= BUNDLE_SIZE { 247 - break 182 + if len(ops) >= target { 183 + return ops[:target], true 248 184 } 249 185 } 250 186 } 251 187 252 - log.Verbose(" Added %d unique operations (total: %d/%d, %d dupes)", 253 - newOpsAdded, len(allOperations), BUNDLE_SIZE, len(rawOperations)-newOpsAdded) 254 - 255 - // If we added no new operations, we're stuck 256 - if newOpsAdded == 0 { 257 - log.Error(" No new unique operations found, stopping") 188 + if !shouldContinue { 258 189 break 259 190 } 191 + } 260 192 261 - // Update cursor for next fetch 262 - if len(rawOperations) > 0 { 263 - lastOp := rawOperations[len(rawOperations)-1] 264 - currentAfter = lastOp.CreatedAt.Format(time.RFC3339Nano) 265 - } 193 + return ops, len(ops) >= target 194 + } 266 195 267 - // If PLC returned less than requested, we've reached the end 268 - if len(rawOperations) < fetchSize { 269 - log.Info(" Reached end of PLC data (got %d < %d requested)", len(rawOperations), fetchSize) 270 - break 271 - } 196 + func (bf *bundleFetcher) calculateBatchSize(remaining int) int { 197 + if bf.fetchCount == 0 { 198 + return 1000 199 + } 200 + if remaining < 100 { 201 + return 50 272 202 } 273 - 274 - // Warn if we hit the fetch limit 275 - if fetchCount >= maxFetches { 276 - log.Verbose(" ⚠ Hit maxFetches limit (%d) with only %d/%d operations", 277 - maxFetches, len(allOperations), BUNDLE_SIZE) 203 + if remaining < 500 { 204 + return 200 278 205 } 206 + return 1000 207 + } 279 208 280 - // Check if we got exactly BUNDLE_SIZE operations 281 - isComplete := len(allOperations) >= BUNDLE_SIZE 209 + func (bf *bundleFetcher) fetchBatch(ctx context.Context, size int) ([]PLCOperation, bool) { 210 + ops, err := bf.client.Export(ctx, ExportOptions{ 211 + Count: size, 212 + After: bf.currentAfter, 213 + }) 282 214 283 - if len(allOperations) > BUNDLE_SIZE { 284 - // Trim to exactly BUNDLE_SIZE 285 - log.Verbose(" Trimming from %d to %d operations", len(allOperations), BUNDLE_SIZE) 286 - allOperations = allOperations[:BUNDLE_SIZE] 215 + if err != nil || len(ops) == 0 { 216 + return nil, false 287 217 } 288 218 289 - log.Info(" Collected %d unique operations after %d fetches (complete=%v, target=%d)", 290 - len(allOperations), fetchCount, isComplete, BUNDLE_SIZE) 291 - 292 - // Only save as bundle if complete 293 - if isComplete { 294 - // Save bundle with both hashes 295 - uncompressedHash, compressedHash, err := bm.saveBundleFileWithHash(path, allOperations) 296 - if err != nil { 297 - log.Error("Warning: failed to save bundle file: %v", err) 298 - } else { 299 - // Index with both hashes 300 - if err := bm.indexBundleWithHash(ctx, bundleNumber, allOperations, path, uncompressedHash, compressedHash); err != nil { 301 - log.Error("Warning: failed to index bundle: %v", err) 302 - } else { 303 - log.Info("✓ Bundle %06d saved [%d ops, hash: %s, compressed: %s]", 304 - bundleNumber, len(allOperations), uncompressedHash[:16]+"...", compressedHash[:16]+"...") 305 - } 306 - } 219 + if len(ops) > 0 { 220 + bf.currentAfter = ops[len(ops)-1].CreatedAt.Format(time.RFC3339Nano) 307 221 } 308 222 309 - return allOperations, isComplete, nil 223 + return ops, len(ops) >= size 310 224 } 311 225 312 - // fetchBundleFromPLCWithCount fetches operations with a specific count 313 - func (bm *BundleManager) fetchBundleFromPLCWithCount(ctx context.Context, client *Client, afterTimestamp string, count int) ([]PLCOperation, error) { 314 - return client.Export(ctx, ExportOptions{ 315 - Count: count, 316 - After: afterTimestamp, 317 - }) 318 - } 226 + // ===== MAIN BUNDLE LOADING ===== 319 227 320 - // saveBundleFileWithHash - NO trailing newline 321 - func (bm *BundleManager) saveBundleFileWithHash(path string, operations []PLCOperation) (string, string, error) { 322 - var jsonlData []byte 323 - for _, op := range operations { 324 - jsonlData = append(jsonlData, op.RawJSON...) 325 - jsonlData = append(jsonlData, '\n') 228 + func (bm *BundleManager) LoadBundle(ctx context.Context, bundleNum int, plcClient *Client) ([]PLCOperation, bool, error) { 229 + if !bm.enabled { 230 + return nil, false, fmt.Errorf("bundle manager disabled") 326 231 } 327 232 328 - uncompressedHash := bm.calculateHash(jsonlData) 329 - compressed := bm.encoder.EncodeAll(jsonlData, nil) 330 - compressedHash := bm.calculateHash(compressed) 233 + bf := bm.newBundleFile(bundleNum) 331 234 332 - if err := os.WriteFile(path, compressed, 0644); err != nil { 333 - return "", "", err 235 + // Try local file first 236 + if bf.exists() { 237 + return bm.loadFromFile(ctx, bundleNum, bf) 334 238 } 335 239 336 - return uncompressedHash, compressedHash, nil 240 + // Fetch from PLC 241 + return bm.fetchFromPLC(ctx, bundleNum, bf, plcClient) 337 242 } 338 243 339 - // fetchBundleFromPLC fetches operations from PLC directory (returns RAW operations) 340 - func (bm *BundleManager) fetchBundleFromPLC(ctx context.Context, client *Client, afterTimestamp string) ([]PLCOperation, error) { 341 - // Just fetch - no deduplication here 342 - return client.Export(ctx, ExportOptions{ 343 - Count: 1000, 344 - After: afterTimestamp, 345 - }) 346 - } 244 + func (bm *BundleManager) loadFromFile(ctx context.Context, bundleNum int, bf *bundleFile) ([]PLCOperation, bool, error) { 245 + log.Verbose("→ Loading bundle %06d from local file", bundleNum) 347 246 348 - // StripBoundaryDuplicates removes operations that were already seen on the previous page 349 - // This is exported so it can be used in verification 350 - func StripBoundaryDuplicates(operations []PLCOperation, boundaryTimestamp string, prevBoundaryCIDs map[string]bool) []PLCOperation { 351 - if len(operations) == 0 { 352 - return operations 247 + // Verify hash if bundle is in DB 248 + if dbBundle, err := bm.db.GetBundleByNumber(ctx, bundleNum); err == nil && dbBundle != nil { 249 + if err := bm.verifyHash(bf.path, dbBundle.CompressedHash); err != nil { 250 + log.Error("⚠ Hash mismatch for bundle %06d! Re-fetching...", bundleNum) 251 + os.Remove(bf.path) 252 + return nil, false, fmt.Errorf("hash mismatch") 253 + } 254 + log.Verbose("✓ Hash verified for bundle %06d", bundleNum) 353 255 } 354 256 355 - boundaryTime, err := time.Parse(time.RFC3339Nano, boundaryTimestamp) 356 - if err != nil { 357 - return operations 257 + if err := bm.load(bf); err != nil { 258 + return nil, false, err 358 259 } 359 260 360 - // Skip operations at the start that match the boundary 361 - startIdx := 0 362 - for startIdx < len(operations) { 363 - op := operations[startIdx] 364 - 365 - // If timestamp is AFTER boundary, we're past duplicates 366 - if op.CreatedAt.After(boundaryTime) { 367 - break 368 - } 369 - 370 - // Same timestamp - check if we've seen this CID before 371 - if op.CreatedAt.Equal(boundaryTime) { 372 - if prevBoundaryCIDs[op.CID] { 373 - // This is a duplicate, skip it 374 - startIdx++ 375 - continue 376 - } 377 - // Same timestamp but new CID - keep it 378 - break 379 - } 380 - 381 - // Earlier timestamp (shouldn't happen) 382 - break 261 + // Index if not in DB 262 + if _, err := bm.db.GetBundleByNumber(ctx, bundleNum); err != nil { 263 + bf.compressedHash = bm.hashFile(bf.path) 264 + bf.uncompressedHash = bm.hash(bm.serializeJSONL(bf.operations)) 265 + bm.indexBundle(ctx, bundleNum, bf) 383 266 } 384 267 385 - return operations[startIdx:] 268 + return bf.operations, true, nil 386 269 } 387 270 388 - // Keep the private version for internal use 389 - func stripBoundaryDuplicates(operations []PLCOperation, boundaryTimestamp string, prevBoundaryCIDs map[string]bool) []PLCOperation { 390 - return StripBoundaryDuplicates(operations, boundaryTimestamp, prevBoundaryCIDs) 391 - } 271 + func (bm *BundleManager) fetchFromPLC(ctx context.Context, bundleNum int, bf *bundleFile, client *Client) ([]PLCOperation, bool, error) { 272 + log.Info("→ Bundle %06d not found locally, fetching from PLC directory...", bundleNum) 392 273 393 - // GetBoundaryCIDs returns all CIDs that share the same timestamp as the last operation 394 - func GetBoundaryCIDs(operations []PLCOperation) (time.Time, map[string]bool) { 395 - if len(operations) == 0 { 396 - return time.Time{}, nil 397 - } 274 + afterTime, prevCIDs := bm.getBoundaryInfo(ctx, bundleNum) 275 + fetcher := newBundleFetcher(client, afterTime, prevCIDs) 398 276 399 - lastOp := operations[len(operations)-1] 400 - boundaryTime := lastOp.CreatedAt 401 - cidSet := make(map[string]bool) 277 + ops, isComplete := fetcher.fetchUntilComplete(ctx, BUNDLE_SIZE) 402 278 403 - // Walk backwards from the end, collecting all CIDs with the same timestamp 404 - for i := len(operations) - 1; i >= 0; i-- { 405 - op := operations[i] 406 - if op.CreatedAt.Equal(boundaryTime) { 407 - cidSet[op.CID] = true 279 + log.Info(" Collected %d unique operations after %d fetches (complete=%v)", 280 + len(ops), fetcher.fetchCount, isComplete) 281 + 282 + if isComplete { 283 + bf.operations = ops 284 + if err := bm.save(bf); err != nil { 285 + log.Error("Warning: failed to save bundle: %v", err) 408 286 } else { 409 - // Different timestamp, we're done 410 - break 287 + bm.indexBundle(ctx, bundleNum, bf) 288 + log.Info("✓ Bundle %06d saved [%d ops, hash: %s...]", 289 + bundleNum, len(ops), bf.uncompressedHash[:16]) 411 290 } 412 291 } 413 292 414 - return boundaryTime, cidSet 415 - } 416 - 417 - // saveBundleFile (keep for compatibility, calls saveBundleFileWithHash) 418 - func (bm *BundleManager) saveBundleFile(path string, operations []PLCOperation) error { 419 - _, _, err := bm.saveBundleFileWithHash(path, operations) // ✅ All 3 values 420 - return err 293 + return ops, isComplete, nil 421 294 } 422 295 423 - // loadBundleFromFile loads operations from bundle file (JSONL format) 424 - func (bm *BundleManager) loadBundleFromFile(path string) ([]PLCOperation, error) { 425 - // Read compressed file 426 - compressedData, err := os.ReadFile(path) 427 - if err != nil { 428 - return nil, fmt.Errorf("failed to read bundle file: %w", err) 296 + func (bm *BundleManager) getBoundaryInfo(ctx context.Context, bundleNum int) (string, map[string]bool) { 297 + if bundleNum == 1 { 298 + return "", nil 429 299 } 430 300 431 - // Decompress 432 - decompressed, err := bm.decoder.DecodeAll(compressedData, nil) 301 + prevBundle, err := bm.db.GetBundleByNumber(ctx, bundleNum-1) 433 302 if err != nil { 434 - return nil, fmt.Errorf("failed to decompress bundle: %w", err) 303 + return "", nil 435 304 } 436 305 437 - // Parse JSONL (newline-delimited JSON) 438 - var operations []PLCOperation 439 - scanner := bufio.NewScanner(bytes.NewReader(decompressed)) 306 + afterTime := prevBundle.EndTime.Format(time.RFC3339Nano) 440 307 441 - lineNum := 0 442 - for scanner.Scan() { 443 - lineNum++ 444 - line := scanner.Bytes() 445 - 446 - // Skip empty lines 447 - if len(line) == 0 { 448 - continue 308 + // Return stored boundary CIDs if available 309 + if len(prevBundle.BoundaryCIDs) > 0 { 310 + cids := make(map[string]bool) 311 + for _, cid := range prevBundle.BoundaryCIDs { 312 + cids[cid] = true 449 313 } 314 + return afterTime, cids 315 + } 450 316 451 - var op PLCOperation 452 - if err := json.Unmarshal(line, &op); err != nil { 453 - return nil, fmt.Errorf("failed to parse operation on line %d: %w", lineNum, err) 317 + // Fallback: compute from file 318 + bf := bm.newBundleFile(bundleNum - 1) 319 + if bf.exists() { 320 + if err := bm.load(bf); err == nil { 321 + _, cids := GetBoundaryCIDs(bf.operations) 322 + return afterTime, cids 454 323 } 455 - 456 - // CRITICAL: Store the original raw JSON bytes 457 - op.RawJSON = make([]byte, len(line)) 458 - copy(op.RawJSON, line) 459 - 460 - operations = append(operations, op) 461 324 } 462 325 463 - if err := scanner.Err(); err != nil { 464 - return nil, fmt.Errorf("error reading JSONL: %w", err) 465 - } 466 - 467 - return operations, nil 326 + return afterTime, nil 468 327 } 469 328 470 - // indexBundleWithHash stores bundle with both hashes 471 - func (bm *BundleManager) indexBundleWithHash(ctx context.Context, bundleNumber int, operations []PLCOperation, path string, uncompressedHash, compressedHash string) error { 472 - // Get previous bundle's hash (uncompressed) 473 - var prevBundleHash string 474 - if bundleNumber > 1 { 475 - prevBundle, err := bm.db.GetBundleByNumber(ctx, bundleNumber-1) 476 - if err == nil && prevBundle != nil { 477 - prevBundleHash = prevBundle.Hash // Use uncompressed hash for chain 478 - log.Verbose(" Linking to previous bundle %06d (hash: %s)", bundleNumber-1, prevBundleHash[:16]+"...") 329 + // ===== BUNDLE INDEXING ===== 330 + 331 + func (bm *BundleManager) indexBundle(ctx context.Context, bundleNum int, bf *bundleFile) error { 332 + prevHash := "" 333 + if bundleNum > 1 { 334 + if prev, err := bm.db.GetBundleByNumber(ctx, bundleNum-1); err == nil { 335 + prevHash = prev.Hash 479 336 } 480 337 } 481 338 482 - // Extract unique DIDs 483 - didSet := make(map[string]bool) 484 - for _, op := range operations { 485 - didSet[op.DID] = true 486 - } 487 - 488 - dids := make([]string, 0, len(didSet)) 489 - for did := range didSet { 490 - dids = append(dids, did) 491 - } 492 - 493 - // Get compressed file size 494 - fileInfo, _ := os.Stat(path) 495 - compressedSize := int64(0) 496 - if fileInfo != nil { 497 - compressedSize = fileInfo.Size() 498 - } 339 + dids := bm.extractUniqueDIDs(bf.operations) 340 + fileSize := bm.getFileSize(bf.path) 499 341 500 342 bundle := &storage.PLCBundle{ 501 - BundleNumber: bundleNumber, 502 - StartTime: operations[0].CreatedAt, 503 - EndTime: operations[len(operations)-1].CreatedAt, 343 + BundleNumber: bundleNum, 344 + StartTime: bf.operations[0].CreatedAt, 345 + EndTime: bf.operations[len(bf.operations)-1].CreatedAt, 504 346 DIDs: dids, 505 - Hash: uncompressedHash, // Primary hash (JSONL) 506 - CompressedHash: compressedHash, // File integrity hash 507 - CompressedSize: compressedSize, // Compressed size 508 - PrevBundleHash: prevBundleHash, // Chain link 347 + Hash: bf.uncompressedHash, 348 + CompressedHash: bf.compressedHash, 349 + CompressedSize: fileSize, 350 + PrevBundleHash: prevHash, 509 351 Compressed: true, 510 352 CreatedAt: time.Now(), 511 353 } ··· 513 355 return bm.db.CreateBundle(ctx, bundle) 514 356 } 515 357 516 - // indexBundle (keep for compatibility) - FIX: Calculate both hashes 517 - func (bm *BundleManager) indexBundle(ctx context.Context, bundleNumber int, operations []PLCOperation, path string) error { 518 - // Calculate compressed hash from file 519 - fileData, err := os.ReadFile(path) 520 - if err != nil { 521 - return err 358 + func (bm *BundleManager) extractUniqueDIDs(ops []PLCOperation) []string { 359 + didSet := make(map[string]bool) 360 + for _, op := range ops { 361 + didSet[op.DID] = true 522 362 } 523 - compressedHash := bm.calculateHash(fileData) 524 363 525 - // Calculate uncompressed hash from operations 526 - var jsonlData []byte 527 - for _, op := range operations { 528 - jsonlData = append(jsonlData, op.RawJSON...) 529 - jsonlData = append(jsonlData, '\n') 364 + dids := make([]string, 0, len(didSet)) 365 + for did := range didSet { 366 + dids = append(dids, did) 530 367 } 531 - uncompressedHash := bm.calculateHash(jsonlData) 532 - 533 - return bm.indexBundleWithHash(ctx, bundleNumber, operations, path, uncompressedHash, compressedHash) 368 + return dids 534 369 } 535 370 536 - // Update CreateBundleFromMempool 371 + // ===== MEMPOOL BUNDLE CREATION ===== 372 + 537 373 func (bm *BundleManager) CreateBundleFromMempool(ctx context.Context, operations []PLCOperation) (int, error) { 538 374 if !bm.enabled { 539 375 return 0, fmt.Errorf("bundle manager disabled") ··· 547 383 if err != nil { 548 384 return 0, err 549 385 } 550 - bundleNumber := lastBundle + 1 386 + bundleNum := lastBundle + 1 551 387 552 - path := bm.GetBundlePath(bundleNumber) 388 + bf := bm.newBundleFile(bundleNum) 389 + bf.operations = operations 553 390 554 - // Save bundle with both hashes 555 - uncompressedHash, compressedHash, err := bm.saveBundleFileWithHash(path, operations) 556 - if err != nil { 391 + if err := bm.save(bf); err != nil { 557 392 return 0, err 558 393 } 559 394 560 - // Index bundle 561 - if err := bm.indexBundleWithHash(ctx, bundleNumber, operations, path, uncompressedHash, compressedHash); err != nil { 395 + if err := bm.indexBundle(ctx, bundleNum, bf); err != nil { 562 396 return 0, err 563 397 } 564 398 565 - log.Info("✓ Created bundle %06d from mempool (hash: %s)", 566 - bundleNumber, uncompressedHash[:16]+"...") 567 - 568 - return bundleNumber, nil 569 - } 570 - 571 - // EnsureBundleContinuity checks that all bundles from 1 to N exist 572 - func (bm *BundleManager) EnsureBundleContinuity(ctx context.Context, targetBundle int) error { 573 - if !bm.enabled { 574 - return nil 575 - } 576 - 577 - for i := 1; i < targetBundle; i++ { 578 - if !bm.BundleExists(i) { 579 - // Check if in database 580 - _, err := bm.db.GetBundleByNumber(ctx, i) 581 - if err != nil { 582 - return fmt.Errorf("bundle %06d is missing (required for continuity)", i) 583 - } 584 - } 585 - } 586 - 587 - return nil 588 - } 399 + log.Info("✓ Created bundle %06d from mempool (hash: %s...)", 400 + bundleNum, bf.uncompressedHash[:16]) 589 401 590 - // GetStats returns bundle statistics 591 - func (bm *BundleManager) GetStats(ctx context.Context) (int64, int64, error) { 592 - if !bm.enabled { 593 - return 0, 0, nil 594 - } 595 - return bm.db.GetBundleStats(ctx) 402 + return bundleNum, nil 596 403 } 597 404 598 - // calculateHash computes SHA256 hash of data 599 - func (bm *BundleManager) calculateHash(data []byte) string { 600 - hash := sha256.Sum256(data) 601 - return hex.EncodeToString(hash[:]) 602 - } 405 + // ===== VERIFICATION ===== 603 406 604 - // verifyBundleHash checks if file hash matches expected hash 605 - func (bm *BundleManager) verifyBundleHash(path string, expectedHash string) (bool, error) { 606 - data, err := os.ReadFile(path) 607 - if err != nil { 608 - return false, err 609 - } 610 - 611 - actualHash := bm.calculateHash(data) 612 - return actualHash == expectedHash, nil 613 - } 614 - 615 - // VerifyChain - FIX 616 407 func (bm *BundleManager) VerifyChain(ctx context.Context, endBundle int) error { 617 408 if !bm.enabled { 618 409 return fmt.Errorf("bundle manager disabled") ··· 626 417 return fmt.Errorf("bundle %06d not found: %w", i, err) 627 418 } 628 419 629 - // Compute file path 630 - filePath := bm.GetBundlePath(i) 631 - 632 420 // Verify file hash 633 - valid, err := bm.verifyBundleHash(filePath, bundle.CompressedHash) 634 - if err != nil { 421 + path := bm.newBundleFile(i).path 422 + if err := bm.verifyHash(path, bundle.CompressedHash); err != nil { 635 423 return fmt.Errorf("bundle %06d hash verification failed: %w", i, err) 636 - } 637 - if !valid { 638 - return fmt.Errorf("bundle %06d compressed hash mismatch!", i) 639 424 } 640 425 641 426 // Verify chain link ··· 660 445 return nil 661 446 } 662 447 663 - // GetChainInfo returns information about the bundle chain 448 + func (bm *BundleManager) EnsureBundleContinuity(ctx context.Context, targetBundle int) error { 449 + if !bm.enabled { 450 + return nil 451 + } 452 + 453 + for i := 1; i < targetBundle; i++ { 454 + if !bm.newBundleFile(i).exists() { 455 + if _, err := bm.db.GetBundleByNumber(ctx, i); err != nil { 456 + return fmt.Errorf("bundle %06d is missing (required for continuity)", i) 457 + } 458 + } 459 + } 460 + 461 + return nil 462 + } 463 + 464 + // ===== UTILITY METHODS ===== 465 + 466 + func (bm *BundleManager) hash(data []byte) string { 467 + h := sha256.Sum256(data) 468 + return hex.EncodeToString(h[:]) 469 + } 470 + 471 + func (bm *BundleManager) hashFile(path string) string { 472 + data, _ := os.ReadFile(path) 473 + return bm.hash(data) 474 + } 475 + 476 + func (bm *BundleManager) verifyHash(path, expectedHash string) error { 477 + if expectedHash == "" { 478 + return nil 479 + } 480 + 481 + actualHash := bm.hashFile(path) 482 + if actualHash != expectedHash { 483 + return fmt.Errorf("hash mismatch") 484 + } 485 + return nil 486 + } 487 + 488 + func (bm *BundleManager) getFileSize(path string) int64 { 489 + if info, err := os.Stat(path); err == nil { 490 + return info.Size() 491 + } 492 + return 0 493 + } 494 + 495 + func (bm *BundleManager) GetStats(ctx context.Context) (int64, int64, error) { 496 + if !bm.enabled { 497 + return 0, 0, nil 498 + } 499 + return bm.db.GetBundleStats(ctx) 500 + } 501 + 664 502 func (bm *BundleManager) GetChainInfo(ctx context.Context) (map[string]interface{}, error) { 665 503 lastBundle, err := bm.db.GetLastBundleNumber(ctx) 666 504 if err != nil { ··· 674 512 }, nil 675 513 } 676 514 677 - // Quick check first and last 678 - firstBundle, err := bm.db.GetBundleByNumber(ctx, 1) 679 - if err != nil { 680 - return nil, err 681 - } 682 - 683 - lastBundleData, err := bm.db.GetBundleByNumber(ctx, lastBundle) 684 - if err != nil { 685 - return nil, err 686 - } 515 + firstBundle, _ := bm.db.GetBundleByNumber(ctx, 1) 516 + lastBundleData, _ := bm.db.GetBundleByNumber(ctx, lastBundle) 687 517 688 518 return map[string]interface{}{ 689 519 "chain_length": lastBundle, ··· 694 524 "chain_head_hash": lastBundleData.Hash, 695 525 }, nil 696 526 } 527 + 528 + // ===== EXPORTED HELPERS ===== 529 + 530 + func GetBoundaryCIDs(operations []PLCOperation) (time.Time, map[string]bool) { 531 + if len(operations) == 0 { 532 + return time.Time{}, nil 533 + } 534 + 535 + lastOp := operations[len(operations)-1] 536 + boundaryTime := lastOp.CreatedAt 537 + cidSet := make(map[string]bool) 538 + 539 + for i := len(operations) - 1; i >= 0; i-- { 540 + op := operations[i] 541 + if op.CreatedAt.Equal(boundaryTime) { 542 + cidSet[op.CID] = true 543 + } else { 544 + break 545 + } 546 + } 547 + 548 + return boundaryTime, cidSet 549 + } 550 + 551 + func StripBoundaryDuplicates(operations []PLCOperation, boundaryTimestamp string, prevBoundaryCIDs map[string]bool) []PLCOperation { 552 + if len(operations) == 0 { 553 + return operations 554 + } 555 + 556 + boundaryTime, err := time.Parse(time.RFC3339Nano, boundaryTimestamp) 557 + if err != nil { 558 + return operations 559 + } 560 + 561 + startIdx := 0 562 + for startIdx < len(operations) { 563 + op := operations[startIdx] 564 + 565 + if op.CreatedAt.After(boundaryTime) { 566 + break 567 + } 568 + 569 + if op.CreatedAt.Equal(boundaryTime) && prevBoundaryCIDs[op.CID] { 570 + startIdx++ 571 + continue 572 + } 573 + 574 + break 575 + } 576 + 577 + return operations[startIdx:] 578 + }
+25 -7
internal/plc/scanner.go
··· 43 43 44 44 // ScanMetrics tracks scan progress 45 45 type ScanMetrics struct { 46 - totalProcessed int64 46 + totalFetched int64 // Total ops fetched from PLC/bundles 47 + totalProcessed int64 // Unique ops processed (after dedup) 48 + newEndpoints int64 // New endpoints discovered 47 49 endpointCounts map[string]int64 48 50 currentBundle int 49 51 startTime time.Time ··· 59 61 60 62 func (m *ScanMetrics) logSummary() { 61 63 summary := formatEndpointCounts(m.endpointCounts) 62 - log.Info("PLC scan completed: %d operations, %s in %v", 63 - m.totalProcessed, summary, time.Since(m.startTime)) 64 + if m.newEndpoints > 0 { 65 + log.Info("PLC scan completed: %d operations processed (%d fetched), %s in %v", 66 + m.totalProcessed, m.totalFetched, summary, time.Since(m.startTime)) 67 + } else { 68 + log.Info("PLC scan completed: %d operations processed (%d fetched), 0 new endpoints in %v", 69 + m.totalProcessed, m.totalFetched, time.Since(m.startTime)) 70 + } 64 71 } 65 72 66 73 func (s *Scanner) Scan(ctx context.Context) error { ··· 190 197 } 191 198 192 199 s.mergeCounts(m.endpointCounts, counts) 193 - m.totalProcessed += int64(len(ops)) 200 + m.totalProcessed += int64(len(ops)) // Unique ops after dedup 201 + m.newEndpoints += sumCounts(counts) // NEW: Track new endpoints 194 202 195 203 batchTotal := sumCounts(counts) 196 204 log.Verbose("✓ Processed bundle %06d: %d operations (after dedup), %d new endpoints", ··· 268 276 } 269 277 270 278 fetchedCount := len(ops) 279 + m.totalFetched += int64(fetchedCount) // Track all fetched 271 280 log.Verbose(" Fetched %d operations from PLC", fetchedCount) 272 281 273 282 if fetchedCount == 0 { ··· 276 285 return false, nil 277 286 } 278 287 279 - // ✅ Fix: Handle errors from GetMempoolCount 280 288 beforeCount, err := s.db.GetMempoolCount(ctx) 281 289 if err != nil { 282 290 return false, err 283 291 } 284 292 293 + endpointsBefore := sumCounts(m.endpointCounts) 285 294 if err := s.addToMempool(ctx, ops, m.endpointCounts); err != nil { 286 295 return false, err 287 296 } 297 + endpointsAfter := sumCounts(m.endpointCounts) 298 + m.newEndpoints += (endpointsAfter - endpointsBefore) // Add new endpoints found 288 299 289 300 afterCount, err := s.db.GetMempoolCount(ctx) 290 301 if err != nil { 291 302 return false, err 292 303 } 293 304 294 - m.totalProcessed += int64(fetchedCount) 305 + uniqueAdded := int64(afterCount - beforeCount) // Cast to int64 306 + m.totalProcessed += uniqueAdded // Track unique ops processed 307 + 295 308 log.Verbose(" Added %d new unique operations to mempool (%d were duplicates)", 296 - afterCount-beforeCount, fetchedCount-(afterCount-beforeCount)) 309 + uniqueAdded, int64(fetchedCount)-uniqueAdded) 297 310 298 311 // Continue only if got full batch 299 312 shouldContinue := fetchedCount >= limit ··· 347 360 } 348 361 349 362 // Process and update metrics 363 + countsBefore := sumCounts(m.endpointCounts) 350 364 counts, _ := s.processBatch(ctx, ops) 351 365 s.mergeCounts(m.endpointCounts, counts) 366 + newEndpointsFound := sumCounts(m.endpointCounts) - countsBefore 367 + 368 + m.totalProcessed += int64(len(ops)) 369 + m.newEndpoints += newEndpointsFound // NEW: Track new endpoints 352 370 m.currentBundle = bundleNum 353 371 354 372 if err := s.updateCursorForBundle(ctx, bundleNum, m.totalProcessed); err != nil {