Python bindings to oxyroot. Makes reading .root files blazing fast 🚀

Compare changes

Choose any two refs to compare.

Changed files
+230 -8
python
oxyroot
src
+4
.gitignore
··· 76 76 77 77 # Marimo cache 78 78 __marimo__/ 79 + 80 + # Large files 81 + *.root 82 + *.pq
+8 -2
Cargo.toml
··· 10 10 11 11 [dependencies] 12 12 oxyroot = "0.1.25" 13 - pyo3 = { version = "0.26.0", features = ["abi3-py38"] } 13 + pyo3 = { version = "0.25.0", features = ["abi3-py38"] } 14 14 parking_lot = "0.12.3" 15 - numpy = "0.26.0" 15 + numpy = "0.25.0" 16 16 parquet = { version = "53.0.0", features = ["arrow"] } 17 17 arrow = "53.0.0" 18 + polars = { version = "0.50.0", features = ["diagonal_concat"] } 19 + pyo3-polars = "0.23.1" 20 + rayon = "1.10.0" 21 + glob = "0.3.1" 22 + once_cell = "1.19.0" 23 + num_cpus = "1.16.0"
+28 -6
README.md
··· 8 8 9 9 A fast, Rust-powered Python reader for CERN ROOT files. 10 10 11 - This package provides a simple and Pythonic interface bindings to `oxyroot`, a rust package, to read data from `.root` files, inspired by libraries like `uproot`. It leverages the speed of Rust for high-performance data extraction and integrates with the scientific Python ecosystem by providing data as NumPy arrays. 11 + This python package provides simple bindings to [`oxyroot`, a rust package](https://github.com/m-dupont/oxyroot), to read data from `.root` files, inspired by libraries like `uproot`. It leverages the speed of Rust and integrates with the scientific Python ecosystem by providing data as NumPy arrays or polars dataframes. 12 12 13 13 ## Features 14 14 15 - - **High-Performance**: Core logic is written in Rust for maximum speed. 16 - - **Parquet Conversion**: Convert TTrees directly to Apache Parquet files with a single command. 17 - - **NumPy Integration**: Get branch data directly as NumPy arrays. 18 - - **Simple, Pythonic API**: Easy to learn and use, and similar to `uproot` 15 + - Simple API similar to `uproot` 16 + - Core logic is written in Rust. 17 + - Get branch data directly as NumPy arrays or Polars dataframe. 18 + - Parquet Conversion: Convert TTrees directly to Apache Parquet files with a single command. 19 19 20 20 ## Quick Start 21 21 ··· 58 58 ) 59 59 ``` 60 60 61 + ## Combining Multiple Files 62 + 63 + You can efficiently read and concatenate a TTree from multiple ROOT files into a single Polars DataFrame using `concat_trees`. This function processes files in parallel to maximize performance. 
64 + 65 + ```python 66 + import oxyroot 67 + 68 + # Combine trees from multiple files using a wildcard 69 + df = oxyroot.concat_trees(paths=["ntuples*.root"], tree_name="mu_mc") 70 + 71 + print(df) 72 + 73 + # You can also provide a list of specific files 74 + # df = oxyroot.concat_trees(paths=["file1.root", "file2.root"], tree_name="my_tree") 75 + 76 + # Control the number of threads used for parallel processing 77 + # By default, it uses half the available CPU cores. 78 + oxyroot.set_num_threads(4) 79 + ``` 80 + 61 81 ## Performance 62 82 63 - `oxyroot` is designed to be fast. Here is a simple benchmark comparing the time taken to read all branches of a TTree with `uproot` and `oxyroot`. 83 + `oxyroot` is intended to be fast. Here is a simple benchmark comparing the time taken to read all branches of a TTree with `uproot` and `oxyroot`. 64 84 65 85 ```python 66 86 import oxyroot ··· 87 107 end_time = time.time() 88 108 print(f"Oxyroot took: {end_time - start_time:.3f}s") 89 109 ``` 110 + 111 + On a small file (~20 MB) with multiple data formats, oxyroot took half the time of uproot, and also read in the branch with strings! 90 112 91 113 ## License 92 114
+5
pyproject.toml
··· 36 36 ] 37 37 dynamic = ["version"] 38 38 39 + dependencies = [ 40 + "numpy", 41 + "polars>=1.8.2", 42 + ] 43 + 39 44 [project.urls] 40 45 Homepage = "https://github.com/vvsagar/py-oxyroot" 41 46 Repository = "https://github.com/vvsagar/py-oxyroot.git"
+31
python/oxyroot/__init__.pyi
··· 1 1 from typing import Iterator, List, Optional 2 2 import numpy as np 3 + import polars as pl 3 4 4 5 class RootFile: 5 6 path: str ··· 13 14 def branches(self) -> List[str]: ... 14 15 def __getitem__(self, name: str) -> Branch: ... 15 16 def __iter__(self) -> Iterator[Branch]: ... 17 + def arrays(self, columns: Optional[List[str]] = None, ignore_columns: Optional[List[str]] = None) -> pl.DataFrame: ... 16 18 def to_parquet(self, output_file: str, overwrite: bool = False, compression: str = "snappy", columns: Optional[List[str]] = None) -> None: ... 17 19 18 20 class Branch: ··· 40 42 A RootFile object. 41 43 """ 42 44 ... 45 + 46 + def concat_trees( 47 + paths: List[str], 48 + tree_name: str, 49 + columns: Optional[List[str]] = None, 50 + ignore_columns: Optional[List[str]] = None, 51 + ) -> pl.DataFrame: 52 + """ 53 + Reads multiple ROOT files, concatenates the specified tree, and returns a single Polars DataFrame. 54 + 55 + Args: 56 + paths: A list of paths to the ROOT files. Wildcards are supported. 57 + tree_name: The name of the tree to read from each file. 58 + columns: An optional list of column names to include. If None, all columns are included. 59 + ignore_columns: An optional list of column names to exclude. 60 + 61 + Returns: 62 + A single Polars DataFrame containing the concatenated data. 63 + """ 64 + ... 65 + 66 + def set_num_threads(num_threads: int) -> None: 67 + """ 68 + Sets the number of threads to use for parallel operations. 69 + 70 + Args: 71 + num_threads: The number of threads to use. 72 + """ 73 + ...
+154
src/lib.rs
··· 11 11 }; 12 12 use arrow::datatypes::{DataType, Field, Schema}; 13 13 use arrow::record_batch::RecordBatch; 14 + use once_cell::sync::Lazy; 15 + use parking_lot::Mutex; 14 16 use parquet::arrow::ArrowWriter; 15 17 use parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; 16 18 use parquet::file::properties::WriterProperties; 19 + use polars::functions::concat_df_diagonal; 20 + use polars::prelude::*; 21 + use pyo3_polars::PyDataFrame; 22 + use rayon::prelude::*; 23 + 24 + static POOL: Lazy<Mutex<rayon::ThreadPool>> = Lazy::new(|| { 25 + let num_threads = std::cmp::max(1, num_cpus::get() / 2); 26 + let pool = rayon::ThreadPoolBuilder::new() 27 + .num_threads(num_threads) 28 + .build() 29 + .unwrap(); 30 + Mutex::new(pool) 31 + }); 32 + 33 + #[pyfunction] 34 + fn set_num_threads(num_threads: usize) -> PyResult<()> { 35 + let pool = rayon::ThreadPoolBuilder::new() 36 + .num_threads(num_threads) 37 + .build() 38 + .map_err(|e| PyValueError::new_err(e.to_string()))?; 39 + *POOL.lock() = pool; 40 + Ok(()) 41 + } 17 42 18 43 #[pyclass(name = "RootFile")] 19 44 struct PyRootFile { ··· 39 64 name: String, 40 65 } 41 66 67 + fn tree_to_dataframe( 68 + tree: &::oxyroot::ReaderTree, 69 + columns: Option<Vec<String>>, 70 + ignore_columns: Option<Vec<String>>, 71 + ) -> PyResult<DataFrame> { 72 + let mut branches_to_save = if let Some(columns) = columns { 73 + columns 74 + } else { 75 + tree.branches().map(|b| b.name().to_string()).collect() 76 + }; 77 + 78 + if let Some(ignore_columns) = ignore_columns { 79 + branches_to_save.retain(|c| !ignore_columns.contains(c)); 80 + } 81 + 82 + let mut series_vec = Vec::new(); 83 + 84 + for branch_name in branches_to_save { 85 + let branch = match tree.branch(&branch_name) { 86 + Some(branch) => branch, 87 + None => { 88 + println!("Branch '{}' not found, skipping", branch_name); 89 + continue; 90 + } 91 + }; 92 + 93 + let series = match branch.item_type_name().as_str() { 94 + "float" => { 95 + let data = 
branch.as_iter::<f32>().unwrap().collect::<Vec<_>>(); 96 + Series::new((&branch_name).into(), data) 97 + } 98 + "double" => { 99 + let data = branch.as_iter::<f64>().unwrap().collect::<Vec<_>>(); 100 + Series::new((&branch_name).into(), data) 101 + } 102 + "int32_t" => { 103 + let data = branch.as_iter::<i32>().unwrap().collect::<Vec<_>>(); 104 + Series::new((&branch_name).into(), data) 105 + } 106 + "int64_t" => { 107 + let data = branch.as_iter::<i64>().unwrap().collect::<Vec<_>>(); 108 + Series::new((&branch_name).into(), data) 109 + } 110 + "uint32_t" => { 111 + let data = branch.as_iter::<u32>().unwrap().collect::<Vec<_>>(); 112 + Series::new((&branch_name).into(), data) 113 + } 114 + "uint64_t" => { 115 + let data = branch.as_iter::<u64>().unwrap().collect::<Vec<_>>(); 116 + Series::new((&branch_name).into(), data) 117 + } 118 + "string" => { 119 + let data = branch.as_iter::<String>().unwrap().collect::<Vec<_>>(); 120 + Series::new((&branch_name).into(), data) 121 + } 122 + other => { 123 + println!("Unsupported branch type: {}, skipping", other); 124 + continue; 125 + } 126 + }; 127 + series_vec.push(series); 128 + } 129 + 130 + DataFrame::new(series_vec.into_iter().map(|s| s.into()).collect()) 131 + .map_err(|e| PyValueError::new_err(e.to_string())) 132 + } 133 + 42 134 #[pymethods] 43 135 impl PyRootFile { 44 136 #[new] ··· 92 184 branches: branches.into_iter(), 93 185 }, 94 186 ) 187 + } 188 + 189 + #[pyo3(signature = (columns = None, ignore_columns = None))] 190 + fn arrays( 191 + &self, 192 + columns: Option<Vec<String>>, 193 + ignore_columns: Option<Vec<String>>, 194 + ) -> PyResult<PyDataFrame> { 195 + let mut file = 196 + RootFile::open(&self.path).map_err(|e| PyValueError::new_err(e.to_string()))?; 197 + let tree = file 198 + .get_tree(&self.name) 199 + .map_err(|e| PyValueError::new_err(e.to_string()))?; 200 + let df = tree_to_dataframe(&tree, columns, ignore_columns)?; 201 + Ok(PyDataFrame(df)) 95 202 } 96 203 97 204 #[pyo3(signature = 
(output_file, overwrite = false, compression = "snappy", columns = None))] ··· 321 428 Ok(env!("CARGO_PKG_VERSION").to_string()) 322 429 } 323 430 431 + #[pyfunction] 432 + #[pyo3(signature = (paths, tree_name, columns = None, ignore_columns = None))] 433 + fn concat_trees( 434 + paths: Vec<String>, 435 + tree_name: String, 436 + columns: Option<Vec<String>>, 437 + ignore_columns: Option<Vec<String>>, 438 + ) -> PyResult<PyDataFrame> { 439 + let mut all_paths = Vec::new(); 440 + for path in paths { 441 + for entry in glob::glob(&path).map_err(|e| PyValueError::new_err(e.to_string()))? { 442 + match entry { 443 + Ok(path) => { 444 + all_paths.push(path.to_str().unwrap().to_string()); 445 + } 446 + Err(e) => return Err(PyValueError::new_err(e.to_string())), 447 + } 448 + } 449 + } 450 + 451 + let pool = POOL.lock(); 452 + let dfs: Vec<DataFrame> = pool.install(|| { 453 + all_paths 454 + .par_iter() 455 + .map(|path| { 456 + let mut file = 457 + RootFile::open(path).map_err(|e| PyValueError::new_err(e.to_string()))?; 458 + let tree = file 459 + .get_tree(&tree_name) 460 + .map_err(|e| PyValueError::new_err(e.to_string()))?; 461 + tree_to_dataframe(&tree, columns.clone(), ignore_columns.clone()) 462 + }) 463 + .filter_map(Result::ok) 464 + .collect() 465 + }); 466 + 467 + if dfs.is_empty() { 468 + return Ok(PyDataFrame(DataFrame::default())); 469 + } 470 + 471 + let combined_df = concat_df_diagonal(&dfs).map_err(|e| PyValueError::new_err(e.to_string()))?; 472 + 473 + Ok(PyDataFrame(combined_df)) 474 + } 475 + 324 476 /// A Python module to read root files, implemented in Rust. 
325 477 #[pymodule] 326 478 fn oxyroot(m: &Bound<'_, PyModule>) -> PyResult<()> { 327 479 m.add_function(wrap_pyfunction!(version, m)?)?; 328 480 m.add_function(wrap_pyfunction!(open, m)?)?; 481 + m.add_function(wrap_pyfunction!(concat_trees, m)?)?; 482 + m.add_function(wrap_pyfunction!(set_num_threads, m)?)?; 329 483 m.add_class::<PyRootFile>()?; 330 484 m.add_class::<PyTree>()?; 331 485 m.add_class::<PyBranch>()?;