"""
Base dataset class for piblin-jax.
Provides the abstract base class for all dataset types with metadata support.
"""
import copy
from abc import ABC
from typing import Any, Self
[docs]
class Dataset(ABC):
"""
Abstract base class for all dataset types.
All piblin-jax datasets inherit from this class and provide:
- Metadata system (conditions and details)
- Internal storage using backend arrays (JAX or NumPy)
- External NumPy conversion for API boundaries
- Immutable design for JAX compatibility
Parameters
----------
conditions : dict[str, Any] | None, optional
Experimental conditions (temperature, pressure, flow rate, etc.).
Default is empty dict.
details : dict[str, Any] | None, optional
Additional context (sample ID, operator, instrument, date, etc.).
Default is empty dict.
Attributes
----------
conditions : dict[str, Any]
Experimental conditions associated with the dataset.
details : dict[str, Any]
Additional metadata and context for the dataset.
Notes
-----
This class cannot be instantiated directly. Use one of the concrete
dataset types:
- ZeroDimensionalDataset (0D)
- OneDimensionalDataset (1D)
- TwoDimensionalDataset (2D)
- ThreeDimensionalDataset (3D)
- Histogram
- Distribution
- OneDimensionalCompositeDataset
The dataset uses an immutable design pattern to ensure compatibility
with JAX transformations (jit, grad, vmap). Arrays are stored internally
as backend arrays (JAX DeviceArray when available, NumPy ndarray otherwise)
and converted to NumPy arrays when accessed through properties.
Examples
--------
>>> from piblin_jax.data.datasets import OneDimensionalDataset
>>> import numpy as np
>>> x = np.linspace(0, 10, 100)
>>> y = np.sin(x)
>>> conditions = {"temperature": 25.0, "sample": "A"}
>>> details = {"operator": "Jane Doe", "date": "2025-10-18"}
>>> dataset = OneDimensionalDataset(
... independent_variable_data=x,
... dependent_variable_data=y,
... conditions=conditions,
... details=details
... )
>>> dataset.conditions["temperature"]
25.0
>>> type(dataset.independent_variable_data)
<class 'numpy.ndarray'>
"""
[docs]
def __init__(
self, conditions: dict[str, Any] | None = None, details: dict[str, Any] | None = None
):
"""
Initialize Dataset with metadata.
Parameters
----------
conditions : dict[str, Any] | None, optional
Experimental conditions.
details : dict[str, Any] | None, optional
Additional context and metadata.
"""
self._conditions = conditions if conditions is not None else {}
self._details = details if details is not None else {}
# Uncertainty quantification attributes (Task Group 12)
self._uncertainty_samples: dict[str, Any] | None = None
self._credible_intervals: tuple[Any, Any] | None = None
self._uncertainty_method: str | None = None
@property
def conditions(self) -> dict[str, Any]:
"""
Get experimental conditions.
:no-index:
Returns
-------
dict[str, Any]
Dictionary of experimental conditions (temperature, pressure, etc.).
Examples
--------
>>> dataset.conditions
{'temperature': 25.0, 'pressure': 1.0, 'sample': 'A'}
"""
return self._conditions
@property
def details(self) -> dict[str, Any]:
"""
Get additional dataset details.
:no-index:
Returns
-------
dict[str, Any]
Dictionary of additional context (operator, instrument, date, etc.).
Examples
--------
>>> dataset.details
{'operator': 'Jane Doe', 'instrument': 'Spectrometer X', 'date': '2025-10-18'}
"""
return self._details
@property
def has_uncertainty(self) -> bool:
"""
Check if dataset has uncertainty information.
:no-index:
Returns
-------
bool
True if dataset has uncertainty information, False otherwise.
Examples
--------
>>> dataset.has_uncertainty
False
>>> dataset_with_unc = dataset.with_uncertainty(n_samples=1000)
>>> dataset_with_unc.has_uncertainty
True
Notes
-----
This property checks for the presence of either uncertainty samples
or cached credible intervals. It does not validate the uncertainty
quantification method or parameter values.
"""
return self._uncertainty_samples is not None or self._credible_intervals is not None
@property
def uncertainty_samples(self) -> Any | None:
"""
Get uncertainty samples (if keep_samples=True was used).
:no-index:
Returns
-------
dict | None
Posterior samples from Bayesian inference if keep_samples=True,
None otherwise.
Examples
--------
>>> dataset_with_unc = dataset.with_uncertainty(
... n_samples=1000,
... method='bayesian',
... keep_samples=True
... )
>>> samples = dataset_with_unc.uncertainty_samples
>>> sigma_samples = samples['sigma']
Notes
-----
Storing samples can be memory-intensive for large datasets. Use
keep_samples=False if you only need credible intervals.
"""
return self._uncertainty_samples
@property
def credible_intervals(self) -> Any | None:
"""
Get cached credible intervals.
:no-index:
Returns
-------
tuple | None
Cached credible intervals (lower, upper) if computed,
None otherwise.
Examples
--------
>>> dataset_with_unc = dataset.with_uncertainty(n_samples=1000)
>>> intervals = dataset_with_unc.credible_intervals
>>> if intervals is not None:
... lower, upper = intervals
Notes
-----
Credible intervals are cached after computation to avoid
recomputation. Use get_credible_intervals() to compute
intervals with custom parameters.
"""
return self._credible_intervals
[docs]
def copy(self) -> Self:
"""
Create a deep copy of this dataset.
Returns
-------
Dataset
A new dataset instance with copied data and metadata.
Examples
--------
>>> dataset_copy = dataset.copy()
>>> dataset_copy.conditions is not dataset.conditions
True
Notes
-----
This creates a deep copy of all data arrays, metadata, and
uncertainty information. The copied dataset is completely
independent of the original.
"""
return copy.deepcopy(self)