+4
-7
examples/disk-read-file/main.rs
+4
-7
examples/disk-read-file/main.rs
···
38
38
39
39
let mut n = 0;
40
40
let mut zeros = 0;
41
-
let (mut rx, worker) = driver.rx(512).await?;
41
+
let mut rx = driver.to_channel(512);
42
42
43
43
log::debug!("walking...");
44
-
while let Some(pairs) = rx.recv().await {
44
+
while let Some(r) = rx.recv().await {
45
+
let pairs = r?;
45
46
n += pairs.len();
46
47
for (_, block) in pairs {
47
48
zeros += block.into_iter().filter(|&b| b == b'0').count()
48
49
}
49
50
}
50
-
log::debug!("done walking! joining...");
51
-
52
-
worker.await.unwrap().unwrap();
53
-
54
-
log::debug!("joined.");
51
+
log::debug!("done walking!");
55
52
56
53
// log::info!("now is the time to check mem...");
57
54
// tokio::time::sleep(std::time::Duration::from_secs(22)).await;
+89
-79
src/drive.rs
+89
-79
src/drive.rs
···
7
7
use serde::{Deserialize, Serialize};
8
8
use std::collections::HashMap;
9
9
use std::convert::Infallible;
10
-
use tokio::io::AsyncRead;
10
+
use tokio::{io::AsyncRead, sync::mpsc};
11
11
12
12
use crate::mst::{Commit, Node};
13
13
use crate::walk::{Step, WalkError, Walker};
···
44
44
#[error("extra bytes remained after decoding")]
45
45
ExtraGarbage,
46
46
}
47
+
48
+
pub type BlockChunk<T> = Vec<(String, T)>;
47
49
48
50
#[derive(Debug, Clone, Serialize, Deserialize)]
49
51
pub enum MaybeProcessedBlock<T> {
···
161
163
}
162
164
}
163
165
166
+
/// The core driver between the block stream and MST walker
167
+
///
168
+
/// In the future, PDSs will export CARs in a stream-friendly order that will
169
+
/// enable processing them with tiny memory overhead. But that future is not
170
+
/// here yet.
171
+
///
172
+
/// CARs are almost always in a stream-unfriendly order, so I'm reverting the
173
+
/// optimistic stream features: we load all block first, then walk the MST.
174
+
///
175
+
/// This makes things much simpler: we only need to worry about spilling to disk
176
+
/// in one place, and we always have a reasonable expecatation about how much
177
+
/// work the init function will do. We can drop the CAR reader before walking,
178
+
/// so the sync/async boundaries become a little easier to work around.
179
+
#[derive(Debug)]
180
+
pub struct MemDriver<T: Processable> {
181
+
blocks: HashMap<Cid, MaybeProcessedBlock<T>>,
182
+
walker: Walker,
183
+
process: fn(Vec<u8>) -> T,
184
+
}
185
+
186
+
impl<T: Processable> MemDriver<T> {
187
+
/// Manually step through the record outputs
188
+
pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> {
189
+
let mut out = Vec::with_capacity(n);
190
+
for _ in 0..n {
191
+
// walk as far as we can until we run out of blocks or find a record
192
+
match self.walker.step(&mut self.blocks, self.process)? {
193
+
Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)),
194
+
Step::Finish => break,
195
+
Step::Step { rkey, data } => {
196
+
out.push((rkey, data));
197
+
continue;
198
+
}
199
+
};
200
+
}
201
+
202
+
if out.is_empty() {
203
+
Ok(None)
204
+
} else {
205
+
Ok(Some(out))
206
+
}
207
+
}
208
+
}
209
+
164
210
/// a paritally memory-loaded car file that needs disk spillover to continue
165
211
pub struct BigCar<R: AsyncRead + Unpin, T: Processable> {
166
212
car: CarReader<R>,
···
204
250
})
205
251
.await??;
206
252
207
-
let (tx, mut rx) = tokio::sync::mpsc::channel::<Vec<(Cid, MaybeProcessedBlock<T>)>>(2);
253
+
let (tx, mut rx) = mpsc::channel::<Vec<(Cid, MaybeProcessedBlock<T>)>>(2);
208
254
209
255
let store_worker = tokio::task::spawn_blocking(move || {
210
256
let mut writer = store.get_writer()?;
···
290
336
pub async fn next_chunk(
291
337
mut self,
292
338
n: usize,
293
-
) -> Result<(Self, Option<Vec<(String, T)>>), DriveError> {
339
+
) -> Result<(Self, Option<BlockChunk<T>>), DriveError> {
294
340
let mut out = Vec::with_capacity(n);
295
341
(self, out) = tokio::task::spawn_blocking(move || {
296
342
let store = self.store;
···
321
367
}
322
368
}
323
369
324
-
pub async fn rx(
370
+
fn read_tx_blocking(
325
371
mut self,
326
372
n: usize,
327
-
) -> Result<
328
-
(
329
-
tokio::sync::mpsc::Receiver<Vec<(String, T)>>,
330
-
tokio::task::JoinHandle<Result<(), DriveError>>,
331
-
),
332
-
DriveError,
333
-
> {
334
-
let (tx, rx) = tokio::sync::mpsc::channel::<Vec<(String, T)>>(1);
373
+
tx: mpsc::Sender<Result<BlockChunk<T>, DriveError>>,
374
+
) -> Result<(), mpsc::error::SendError<Result<BlockChunk<T>, DriveError>>> {
375
+
let mut reader = match self.store.get_reader() {
376
+
Ok(r) => r,
377
+
Err(e) => return tx.blocking_send(Err(e.into())),
378
+
};
335
379
336
-
// sketch: this worker is going to be allowed to execute without a join handle
337
-
// ...should we return the join handle here so the caller at least knows about it?
338
-
// yes probably for error handling?? (orrr put errors in the channel)
339
-
let worker = tokio::task::spawn_blocking(move || {
340
-
let mut reader = self.store.get_reader()?;
380
+
loop {
381
+
let mut out: BlockChunk<T> = Vec::with_capacity(n);
341
382
342
-
loop {
343
-
let mut out = Vec::with_capacity(n);
383
+
for _ in 0..n {
384
+
// walk as far as we can until we run out of blocks or find a record
344
385
345
-
for _ in 0..n {
346
-
// walk as far as we can until we run out of blocks or find a record
347
-
match self.walker.disk_step(&mut reader, self.process)? {
348
-
Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)),
349
-
Step::Finish => break,
350
-
Step::Step { rkey, data } => {
351
-
out.push((rkey, data));
352
-
continue;
353
-
}
354
-
};
355
-
}
386
+
let step = match self.walker.disk_step(&mut reader, self.process) {
387
+
Ok(s) => s,
388
+
Err(e) => return tx.blocking_send(Err(e.into())),
389
+
};
356
390
357
-
if out.is_empty() {
358
-
break;
359
-
}
360
-
tx.blocking_send(out)
361
-
.map_err(|_| DriveError::ChannelSendError)?;
391
+
match step {
392
+
Step::Missing(cid) => {
393
+
return tx.blocking_send(Err(DriveError::MissingBlock(cid)));
394
+
}
395
+
Step::Finish => return Ok(()),
396
+
Step::Step { rkey, data } => {
397
+
out.push((rkey, data));
398
+
continue;
399
+
}
400
+
};
362
401
}
363
402
364
-
drop(reader); // cannot outlive store
365
-
Ok(())
366
-
}); // await later
403
+
if out.is_empty() {
404
+
break;
405
+
}
406
+
tx.blocking_send(Ok(out))?;
407
+
}
367
408
368
-
Ok((rx, worker))
409
+
Ok(())
369
410
}
370
-
}
371
411
372
-
/// The core driver between the block stream and MST walker
373
-
///
374
-
/// In the future, PDSs will export CARs in a stream-friendly order that will
375
-
/// enable processing them with tiny memory overhead. But that future is not
376
-
/// here yet.
377
-
///
378
-
/// CARs are almost always in a stream-unfriendly order, so I'm reverting the
379
-
/// optimistic stream features: we load all block first, then walk the MST.
380
-
///
381
-
/// This makes things much simpler: we only need to worry about spilling to disk
382
-
/// in one place, and we always have a reasonable expecatation about how much
383
-
/// work the init function will do. We can drop the CAR reader before walking,
384
-
/// so the sync/async boundaries become a little easier to work around.
385
-
#[derive(Debug)]
386
-
pub struct MemDriver<T: Processable> {
387
-
blocks: HashMap<Cid, MaybeProcessedBlock<T>>,
388
-
walker: Walker,
389
-
process: fn(Vec<u8>) -> T,
390
-
}
412
+
pub fn to_channel(self, n: usize) -> mpsc::Receiver<Result<BlockChunk<T>, DriveError>> {
413
+
let (tx, rx) = mpsc::channel::<Result<BlockChunk<T>, DriveError>>(1);
391
414
392
-
impl<T: Processable> MemDriver<T> {
393
-
/// Manually step through the record outputs
394
-
pub async fn next_chunk(&mut self, n: usize) -> Result<Option<Vec<(String, T)>>, DriveError> {
395
-
let mut out = Vec::with_capacity(n);
396
-
for _ in 0..n {
397
-
// walk as far as we can until we run out of blocks or find a record
398
-
match self.walker.step(&mut self.blocks, self.process)? {
399
-
Step::Missing(cid) => return Err(DriveError::MissingBlock(cid)),
400
-
Step::Finish => break,
401
-
Step::Step { rkey, data } => {
402
-
out.push((rkey, data));
403
-
continue;
404
-
}
405
-
};
406
-
}
415
+
// sketch: this worker is going to be allowed to execute without a join handle
416
+
tokio::task::spawn_blocking(move || {
417
+
if let Err(mpsc::error::SendError(_)) = self.read_tx_blocking(n, tx) {
418
+
log::debug!("big car reader exited early due to dropped receiver channel");
419
+
}
420
+
});
407
421
408
-
if out.is_empty() {
409
-
Ok(None)
410
-
} else {
411
-
Ok(Some(out))
412
-
}
422
+
rx
413
423
}
414
424
}