"""File readers and auto-detection system.
This module provides:
- Generic CSV and TXT readers
- Multi-layer auto-detection system
- Extensible reader registry
- read_file function for automatic file reading
The auto-detection system is designed around four layers:
1. Extension-based (.csv, .txt, etc.)
2. Header-based (instrument signatures) -- planned, not yet implemented
3. Content-based (parse first lines) -- planned, not yet implemented
4. Fallback to generic readers
"""
from collections.abc import Callable
from pathlib import Path
from typing import Any
from piblin_jax.data.collections import Measurement
from .csv import GenericCSVReader
from .txt import GenericTXTReader
def _tsv_reader() -> GenericCSVReader:
    """Build a CSV reader configured with a tab delimiter for ``.tsv`` files."""
    return GenericCSVReader(delimiter="\t")


# Lookup table from lower-case file extension (leading dot included) to the
# callable that produces a reader for it. Each value is invoked with no
# arguments, so it may be either a reader class or a zero-argument factory.
_READER_REGISTRY: dict[str, type | Callable[[], Any]] = {
    ".csv": GenericCSVReader,
    ".txt": GenericTXTReader,
    ".tsv": _tsv_reader,
    # Common extensions for plain data files
    ".dat": GenericTXTReader,
    ".data": GenericTXTReader,
}
[docs]
def register_reader(extension: str, reader_class: type | Callable[[], Any]) -> None:
    """Register a custom reader for a file extension.

    This allows users to add support for custom file formats without modifying
    the core library. Registering an extension that is already present
    replaces the previous entry.

    Parameters
    ----------
    extension : str
        File extension, e.g. ``".xyz"``. The value is normalized before it is
        stored: surrounding whitespace is stripped, it is lower-cased, and a
        leading dot is added when it is missing (so ``"XYZ"`` and ``".xyz"``
        refer to the same entry). An empty string is stored unchanged.
    reader_class : Type | Callable
        Reader class or factory function that returns a reader instance.
        The reader must implement a ``read(filepath)`` method that returns
        a Measurement object.

    Examples
    --------
    Register a custom reader class:

    >>> class MyCustomReader:
    ...     def read(self, filepath):
    ...         # ... custom reading logic
    ...         pass
    >>> register_reader('.xyz', MyCustomReader)

    Register a factory function:

    >>> register_reader('.custom', lambda: GenericCSVReader(delimiter='|'))

    Notes
    -----
    Custom readers should follow the same interface as GenericCSVReader,
    implementing a ``read(filepath)`` method that returns a Measurement.
    """
    key = extension.strip().lower()
    # Lookups use ``Path.suffix``, which is either empty or starts with a dot;
    # a key stored without the dot could never be matched, so add it here.
    if key and not key.startswith("."):
        key = f".{key}"
    _READER_REGISTRY[key] = reader_class
[docs]
def detect_reader(filepath: str | Path) -> GenericCSVReader | GenericTXTReader:
    """Pick and instantiate a reader for the given file path.

    The detection strategy is organized in layers:

    1. **Extension-based**: look the file extension up in the reader registry
    2. **Header-based**: instrument signatures in the file header (future)
    3. **Content-based**: analysis of the file content structure (future)
    4. **Fallback**: a generic reader chosen from the extension

    Parameters
    ----------
    filepath : str | Path
        Path to file. Only its suffix is inspected, compared
        case-insensitively.

    Returns
    -------
    Reader instance
        A new instance produced by the registered class or factory for the
        extension, or a generic reader when the extension is not registered.

    Examples
    --------
    >>> reader = detect_reader("data.csv")
    >>> isinstance(reader, GenericCSVReader)
    True
    >>> reader = detect_reader("data.txt")
    >>> isinstance(reader, GenericTXTReader)
    True

    Notes
    -----
    Only Layer 1 (extension-based) and Layer 4 (fallback) are implemented.
    Layers 2 and 3 are placeholders for detecting specific instrument file
    formats.
    """
    suffix = Path(filepath).suffix.lower()

    # Layer 1: extension-based detection via the registry.
    try:
        make_reader = _READER_REGISTRY[suffix]
    except KeyError:
        pass
    else:
        # Reader classes and zero-argument factories are invoked the same way.
        return make_reader()  # type: ignore[no-any-return]

    # Layer 2: header-based detection (future implementation).
    # Could read the first few lines and check for instrument signatures,
    # e.g. "# Keithley 2400", "# Agilent 34401A".

    # Layer 3: content-based detection (future implementation).
    # Could analyze the file structure to determine the format,
    # e.g. detect delimiter, number of columns, data types.

    # Layer 4: fallback to a generic reader guessed from the extension.
    if suffix in (".dat", ".data", ""):
        return GenericTXTReader()
    # Any other unknown extension defaults to the CSV reader.
    return GenericCSVReader()
[docs]
def read_file(filepath: str | Path) -> Measurement:
    """Read a file, choosing the reader automatically.

    Main entry point for reading a single file: the reader is selected with
    ``detect_reader()`` and its ``read()`` method is called on the same path.

    Parameters
    ----------
    filepath : str | Path
        Path to file

    Returns
    -------
    Measurement
        Measurement object containing datasets and metadata

    Raises
    ------
    FileNotFoundError
        If the file does not exist
    ValueError
        If the file format is invalid or cannot be parsed

    Examples
    --------
    Read a CSV file:

    >>> measurement = read_file("data.csv")

    Read a TXT file:

    >>> measurement = read_file("experiment.txt")

    Read with explicit path:

    >>> from pathlib import Path
    >>> measurement = read_file(Path("/data/experiment/sample1.csv"))

    Notes
    -----
    Nothing is caught here, so any exception from the selected reader's
    ``read()`` propagates to the caller. For more control over the reading
    process, call ``detect_reader()`` yourself and then invoke the returned
    reader's ``read()`` method directly.
    """
    return detect_reader(filepath).read(filepath)
# Public API of this module: the names exported by a star-import.
# The generic reader classes imported above are re-exported here alongside
# the detection, reading and registration helpers defined in this file.
__all__ = [
"GenericCSVReader",
"GenericTXTReader",
"detect_reader",
"read_file",
"register_reader",
]