# Source code for piblin_jax.data.datasets.base

"""
Base dataset class for piblin-jax.

Provides the abstract base class for all dataset types with metadata support.
"""

import copy
from abc import ABC
from typing import Any, Self



# [docs]
class Dataset(ABC):
    """
    Abstract base class for all dataset types.

    All piblin-jax datasets inherit from this class and provide:

    - Metadata system (conditions and details)
    - Internal storage using backend arrays (JAX or NumPy)
    - External NumPy conversion for API boundaries
    - Immutable design for JAX compatibility

    Parameters
    ----------
    conditions : dict[str, Any] | None, optional
        Experimental conditions (temperature, pressure, flow rate, etc.).
        Default is empty dict.
    details : dict[str, Any] | None, optional
        Additional context (sample ID, operator, instrument, date, etc.).
        Default is empty dict.

    Attributes
    ----------
    conditions : dict[str, Any]
        Experimental conditions associated with the dataset.
    details : dict[str, Any]
        Additional metadata and context for the dataset.

    Notes
    -----
    This class cannot be instantiated directly. Use one of the concrete
    dataset types:

    - ZeroDimensionalDataset (0D)
    - OneDimensionalDataset (1D)
    - TwoDimensionalDataset (2D)
    - ThreeDimensionalDataset (3D)
    - Histogram
    - Distribution
    - OneDimensionalCompositeDataset

    The dataset uses an immutable design pattern to ensure compatibility
    with JAX transformations (jit, grad, vmap). Arrays are stored internally
    as backend arrays (``jax.Array`` when JAX is available, NumPy ``ndarray``
    otherwise) and converted to NumPy arrays when accessed through properties.

    Examples
    --------
    >>> from piblin_jax.data.datasets import OneDimensionalDataset
    >>> import numpy as np
    >>> x = np.linspace(0, 10, 100)
    >>> y = np.sin(x)
    >>> conditions = {"temperature": 25.0, "sample": "A"}
    >>> details = {"operator": "Jane Doe", "date": "2025-10-18"}
    >>> dataset = OneDimensionalDataset(
    ...     independent_variable_data=x,
    ...     dependent_variable_data=y,
    ...     conditions=conditions,
    ...     details=details
    ... )
    >>> dataset.conditions["temperature"]
    25.0
    >>> type(dataset.independent_variable_data)
    <class 'numpy.ndarray'>
    """


    # [docs]
    def __init__(
        self, conditions: dict[str, Any] | None = None, details: dict[str, Any] | None = None
    ):
        """
        Initialize Dataset with metadata.

        Parameters
        ----------
        conditions : dict[str, Any] | None, optional
            Experimental conditions.
        details : dict[str, Any] | None, optional
            Additional context and metadata.
        """
        self._conditions = conditions if conditions is not None else {}
        self._details = details if details is not None else {}

        # Uncertainty quantification attributes (Task Group 12)
        self._uncertainty_samples: dict[str, Any] | None = None
        self._credible_intervals: tuple[Any, Any] | None = None
        self._uncertainty_method: str | None = None


    @property
    def conditions(self) -> dict[str, Any]:
        """
        Get experimental conditions.

        :no-index:

        Returns
        -------
        dict[str, Any]
            Dictionary of experimental conditions (temperature, pressure, etc.).

        Examples
        --------
        >>> dataset.conditions
        {'temperature': 25.0, 'pressure': 1.0, 'sample': 'A'}
        """
        return self._conditions

    @property
    def details(self) -> dict[str, Any]:
        """
        Get additional dataset details.

        :no-index:

        Returns
        -------
        dict[str, Any]
            Dictionary of additional context (operator, instrument, date, etc.).

        Examples
        --------
        >>> dataset.details
        {'operator': 'Jane Doe', 'instrument': 'Spectrometer X', 'date': '2025-10-18'}
        """
        return self._details

    @property
    def has_uncertainty(self) -> bool:
        """
        Check if dataset has uncertainty information.

        :no-index:

        Returns
        -------
        bool
            True if dataset has uncertainty information, False otherwise.

        Examples
        --------
        >>> dataset.has_uncertainty
        False
        >>> dataset_with_unc = dataset.with_uncertainty(n_samples=1000)
        >>> dataset_with_unc.has_uncertainty
        True

        Notes
        -----
        This property checks for the presence of either uncertainty samples
        or cached credible intervals. It does not validate the uncertainty
        quantification method or parameter values.
        """
        return self._uncertainty_samples is not None or self._credible_intervals is not None

    @property
    def uncertainty_samples(self) -> Any | None:
        """
        Get uncertainty samples (if keep_samples=True was used).

        :no-index:

        Returns
        -------
        dict | None
            Posterior samples from Bayesian inference if keep_samples=True,
            None otherwise.

        Examples
        --------
        >>> dataset_with_unc = dataset.with_uncertainty(
        ...     n_samples=1000,
        ...     method='bayesian',
        ...     keep_samples=True
        ... )
        >>> samples = dataset_with_unc.uncertainty_samples
        >>> sigma_samples = samples['sigma']

        Notes
        -----
        Storing samples can be memory-intensive for large datasets. Use
        keep_samples=False if you only need credible intervals.
        """
        return self._uncertainty_samples

    @property
    def credible_intervals(self) -> Any | None:
        """
        Get cached credible intervals.

        :no-index:

        Returns
        -------
        tuple | None
            Cached credible intervals (lower, upper) if computed,
            None otherwise.

        Examples
        --------
        >>> dataset_with_unc = dataset.with_uncertainty(n_samples=1000)
        >>> intervals = dataset_with_unc.credible_intervals
        >>> if intervals is not None:
        ...     lower, upper = intervals

        Notes
        -----
        Credible intervals are cached after computation to avoid
        recomputation. Use get_credible_intervals() to compute
        intervals with custom parameters.
        """
        return self._credible_intervals


    # [docs]
    def copy(self) -> Self:
        """
        Create a deep copy of this dataset.

        Returns
        -------
        Dataset
            A new dataset instance with copied data and metadata.

        Examples
        --------
        >>> dataset_copy = dataset.copy()
        >>> dataset_copy.conditions is not dataset.conditions
        True

        Notes
        -----
        This creates a deep copy of all data arrays, metadata, and
        uncertainty information. The copied dataset is completely
        independent of the original.
        """
        return copy.deepcopy(self)