"""Data I/O system for piblin-jax.
This module provides a comprehensive file I/O system with:
- Generic CSV and TXT readers
- Auto-detection of file formats
- Batch reading of multiple files
- Automatic hierarchy building from file lists
- Extensible reader registry
Main Functions
--------------
read_file : Read single file with auto-detection
read_files : Read multiple files and build hierarchy
read_directory : Read all matching files in a directory
read_directories : Read multiple directories
Examples
--------
Read a single file:
>>> from piblin_jax.dataio import read_file
>>> measurement = read_file("data.csv")
Read multiple files:
>>> files = ["sample1.csv", "sample2.csv", "sample3.csv"]
>>> experiment_set = read_files(files)
Read an entire directory:
>>> experiment_set = read_directory("/path/to/data", pattern="*.csv")
Read multiple directories:
>>> paths = ["/path/to/exp1", "/path/to/exp2"]
>>> experiment_set = read_directories(paths)
"""
from collections.abc import Sequence
from pathlib import Path
from piblin_jax.data.collections import ExperimentSet
from .hierarchy import build_hierarchy
from .readers import detect_reader, read_file, register_reader
def read_files(file_list: Sequence[str | Path]) -> ExperimentSet:
    """Read multiple files and build hierarchical structure.

    Every entry of ``file_list`` is read with ``read_file`` (which
    auto-detects the format), and the resulting measurements are passed
    together to ``build_hierarchy``.

    Parameters
    ----------
    file_list : Sequence[str | Path]
        File paths to read. The entries are consumed exactly once, in
        order, so a one-shot iterable (for example the generator returned
        by ``Path.glob``) is handled the same way as a list.

    Returns
    -------
    ExperimentSet
        Hierarchical organization of all measurements. An empty
        ``ExperimentSet`` is returned when there is nothing to read.

    Raises
    ------
    FileNotFoundError
        If any file in the list does not exist
    ValueError
        If any file cannot be parsed

    Examples
    --------
    Read specific files:

    >>> files = ["sample1.csv", "sample2.csv", "sample3.csv"]
    >>> experiment_set = read_files(files)
    >>> len(experiment_set.experiments)
    1

    With Path objects:

    >>> from pathlib import Path
    >>> files = list(Path("/data").glob("*.csv"))
    >>> experiment_set = read_files(files)

    Notes
    -----
    All measurements from all files are analyzed together to identify
    constant and varying conditions, which determines the hierarchy
    structure. Files with the same conditions are grouped together.
    """
    # Read first, test for emptiness afterwards: a generator is always
    # truthy, so checking ``file_list`` itself would let an empty one-shot
    # iterable slip through to ``build_hierarchy``.
    measurements = [read_file(f) for f in file_list]
    if not measurements:
        return ExperimentSet([])

    # Build hierarchy from measurements
    return build_hierarchy(measurements)
def read_directory(
    path: str | Path, pattern: str = "*.csv", recursive: bool = False
) -> ExperimentSet:
    """Read all matching files in a directory.

    Scans a directory for files matching the pattern, reads them all via
    ``read_files``, and builds a hierarchical structure.

    Parameters
    ----------
    path : str | Path
        Directory path to scan
    pattern : str, optional
        Glob pattern for file matching (default: ``"*.csv"``).
        Examples: "*.txt", "*.dat", "sample_*.csv"
    recursive : bool, optional
        If True, search recursively in subdirectories (default: False)

    Returns
    -------
    ExperimentSet
        Hierarchical organization of all measurements. An empty
        ``ExperimentSet`` is returned when no file matches.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist
    ValueError
        If ``path`` exists but is not a directory, or if any file cannot
        be parsed

    Examples
    --------
    Read all CSV files in a directory:

    >>> experiment_set = read_directory("/data/experiment1")

    Read all TXT files:

    >>> experiment_set = read_directory("/data/experiment1", pattern="*.txt")

    Read recursively:

    >>> experiment_set = read_directory(
    ...     "/data",
    ...     pattern="*.csv",
    ...     recursive=True
    ... )

    Read with custom pattern:

    >>> experiment_set = read_directory(
    ...     "/data",
    ...     pattern="sample_A*.csv"
    ... )

    Notes
    -----
    Matching paths are sorted before reading for consistent ordering.
    Glob matches that are not files (for example a subdirectory whose
    name happens to match the pattern) are skipped.
    All measurements are analyzed together to build the hierarchy.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Directory not found: {path}")
    if not path.is_dir():
        raise ValueError(f"Path is not a directory: {path}")

    # glob/rglob yield every matching entry, directories included, so keep
    # only real files before handing them to the readers.
    matches = path.rglob(pattern) if recursive else path.glob(pattern)
    files = sorted(match for match in matches if match.is_file())

    if not files:
        # Return empty ExperimentSet if no files found
        return ExperimentSet([])
    return read_files(files)
def read_directories(
    path_list: Sequence[str | Path], pattern: str = "*.csv", recursive: bool = False
) -> ExperimentSet:
    """Read multiple directories and combine into single hierarchy.

    Scans multiple directories for matching files and builds a unified
    hierarchical structure from all measurements via ``read_files``.

    Parameters
    ----------
    path_list : Sequence[str | Path]
        List of directory paths to scan
    pattern : str, optional
        Glob pattern for file matching (default: ``"*.csv"``)
    recursive : bool, optional
        If True, search recursively in subdirectories (default: False)

    Returns
    -------
    ExperimentSet
        Hierarchical organization of all measurements from all
        directories. An empty ``ExperimentSet`` is returned when
        ``path_list`` is empty or no file matches.

    Raises
    ------
    FileNotFoundError
        If any directory does not exist
    ValueError
        If any entry exists but is not a directory, or if any file cannot
        be parsed

    Examples
    --------
    Read from multiple directories:

    >>> paths = ["/data/exp1", "/data/exp2", "/data/exp3"]
    >>> experiment_set = read_directories(paths)

    With custom pattern:

    >>> experiment_set = read_directories(
    ...     paths,
    ...     pattern="*.txt"
    ... )

    Recursive search:

    >>> experiment_set = read_directories(
    ...     paths,
    ...     recursive=True
    ... )

    Notes
    -----
    All measurements from all directories are combined and analyzed together
    to build a unified hierarchy. This is useful when an experiment spans
    multiple directories. Glob matches that are not files (for example a
    subdirectory whose name matches the pattern) are skipped. Directories
    are validated in order, so the first invalid entry raises before any
    file is read.
    """
    # Collect all files from all directories. An empty ``path_list`` simply
    # leaves this list empty and is covered by the check further down.
    all_files: list[Path] = []
    for entry in path_list:
        directory = Path(entry)
        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")
        if not directory.is_dir():
            raise ValueError(f"Path is not a directory: {directory}")

        # glob/rglob yield every matching entry, directories included, so
        # keep only real files before handing them to the readers.
        matches = directory.rglob(pattern) if recursive else directory.glob(pattern)
        all_files.extend(match for match in matches if match.is_file())

    # Sort for consistent ordering
    all_files.sort()

    if not all_files:
        return ExperimentSet([])
    return read_files(all_files)
# Public API of this module: the batch readers defined above
# (read_files, read_directory, read_directories) together with the names
# re-exported from .hierarchy (build_hierarchy) and .readers
# (detect_reader, read_file, register_reader).
__all__ = [
    "build_hierarchy",
    "detect_reader",
    "read_directories",
    "read_directory",
    "read_file",
    "read_files",
    "register_reader",
]