Source code for piblin_jax.data.datasets.one_dimensional

"""
One-dimensional dataset with independent and dependent variables.

This is the most common dataset type, used for time series, spectra,
chromatograms, and other 1D data.
"""

import copy
from typing import Any

import numpy as np

from piblin_jax.backend import is_jax_available, jnp, to_numpy

from .base import Dataset



[docs]
class OneDimensionalDataset(Dataset):
    """
    One-dimensional dataset with independent and dependent variables.

    This is the most common dataset type, representing paired (x, y) data such as:
    - Time series measurements
    - Spectroscopy data (wavelength vs. absorbance)
    - Chromatography traces (time vs. detector response)
    - Titration curves (volume vs. pH)

    Parameters
    ----------
    independent_variable_data : array_like
        1D array of independent variable values (e.g., time, wavelength).
    dependent_variable_data : array_like
        1D array of dependent variable values (e.g., signal, absorbance).
    conditions : dict[str, Any] | None, optional
        Experimental conditions.
    details : dict[str, Any] | None, optional
        Additional metadata.

    Attributes
    ----------
    independent_variable_data : np.ndarray
        Independent variable as NumPy array.
    dependent_variable_data : np.ndarray
        Dependent variable as NumPy array.
    conditions : dict[str, Any]
        Experimental conditions.
    details : dict[str, Any]
        Additional metadata.

    Raises
    ------
    ValueError
        If independent and dependent arrays have different shapes.

    Examples
    --------
    >>> import numpy as np
    >>> from piblin_jax.data.datasets import OneDimensionalDataset
    >>> # Time series data
    >>> time = np.linspace(0, 10, 100)
    >>> signal = np.sin(time)
    >>> dataset = OneDimensionalDataset(
    ...     independent_variable_data=time,
    ...     dependent_variable_data=signal,
    ...     conditions={"temperature": 25.0, "sample": "A"},
    ...     details={"instrument": "oscilloscope", "sampling_rate": 10.0}
    ... )
    >>> dataset.independent_variable_data.shape
    (100,)
    >>> dataset.dependent_variable_data.shape
    (100,)

    >>> # Spectroscopy data
    >>> wavelength = np.linspace(200, 800, 500)
    >>> absorbance = np.exp(-((wavelength - 450) ** 2) / 5000)
    >>> spectrum = OneDimensionalDataset(
    ...     independent_variable_data=wavelength,
    ...     dependent_variable_data=absorbance,
    ...     conditions={"concentration": 1e-5, "solvent": "water"},
    ...     details={"units_x": "nm", "units_y": "AU"}
    ... )

    Notes
    -----
    Arrays are stored internally as backend arrays (JAX DeviceArray when available,
    NumPy ndarray otherwise) and converted to NumPy arrays when accessed through
    properties. This ensures compatibility with JAX transformations while maintaining
    a NumPy-compatible API.
    """


[docs]
    def __init__(
        self,
        independent_variable_data: Any,
        dependent_variable_data: Any,
        conditions: dict[str, Any] | None = None,
        details: dict[str, Any] | None = None,
    ):
        """
        Initialize one-dimensional dataset.

        Parameters
        ----------
        independent_variable_data : array_like
            1D array of independent variable values.
        dependent_variable_data : array_like
            1D array of dependent variable values.
        conditions : dict[str, Any] | None, optional
            Experimental conditions.
        details : dict[str, Any] | None, optional
            Additional metadata.

        Raises
        ------
        ValueError
            If arrays have different shapes.
        """
        super().__init__(conditions=conditions, details=details)

        # Convert to backend arrays and store internally
        self._independent_variable_data = jnp.asarray(independent_variable_data)
        self._dependent_variable_data = jnp.asarray(dependent_variable_data)

        # Validation: arrays must have same shape
        if self._independent_variable_data.shape != self._dependent_variable_data.shape:
            raise ValueError(
                f"Independent and dependent arrays must have same shape. "
                f"Got independent: {self._independent_variable_data.shape}, "
                f"dependent: {self._dependent_variable_data.shape}"
            )


    @property
    def independent_variable_data(self) -> np.ndarray:
        """
        Get independent variable data as NumPy array.

        :no-index:

        Returns
        -------
        np.ndarray
            1D NumPy array of independent variable values.

        Examples
        --------
        >>> dataset.independent_variable_data
        array([0., 0.1, 0.2, ..., 9.8, 9.9, 10.])
        """
        return to_numpy(self._independent_variable_data)

    @property
    def dependent_variable_data(self) -> np.ndarray:
        """
        Get dependent variable data as NumPy array.

        :no-index:

        Returns
        -------
        np.ndarray
            1D NumPy array of dependent variable values.

        Examples
        --------
        >>> dataset.dependent_variable_data
        array([0.000, 0.099, 0.198, ..., -0.544, -0.456, -0.544])
        """
        return to_numpy(self._dependent_variable_data)


[docs]
    def with_uncertainty(
        self,
        n_samples: int = 1000,
        method: str = "bayesian",
        keep_samples: bool = False,
        level: float = 0.95,
    ) -> "OneDimensionalDataset":
        """
        Add uncertainty quantification to dataset.

        This method creates a new dataset with uncertainty information computed
        using the specified method. The original dataset is not modified.

        Parameters
        ----------
        n_samples : int, optional
            Number of samples for uncertainty quantification (default: 1000)
        method : str, optional
            Method for uncertainty quantification (default: 'bayesian'):
            - 'bayesian': NumPyro MCMC sampling
            - 'bootstrap': Bootstrap resampling (not yet implemented)
            - 'analytical': Analytical uncertainty propagation (not yet implemented)
        keep_samples : bool, optional
            If True, store full posterior samples (default: False)
        level : float, optional
            Credible interval level (default: 0.95)

        Returns
        -------
        OneDimensionalDataset
            New dataset with uncertainty information

        Raises
        ------
        NotImplementedError
            If method is not 'bayesian'

        Examples
        --------
        >>> import numpy as np
        >>> from piblin_jax.data.datasets import OneDimensionalDataset
        >>> x = np.linspace(0, 10, 50)
        >>> y = 2.0 * x + 1.0 + 0.1 * np.random.randn(len(x))
        >>> dataset = OneDimensionalDataset(
        ...     independent_variable_data=x,
        ...     dependent_variable_data=y
        ... )
        >>> # Add Bayesian uncertainty
        >>> dataset_with_unc = dataset.with_uncertainty(
        ...     n_samples=1000,
        ...     method='bayesian',
        ...     keep_samples=False,
        ...     level=0.95
        ... )
        >>> dataset_with_unc.has_uncertainty
        True
        >>> lower, upper = dataset_with_unc.credible_intervals
        >>> # With full samples
        >>> dataset_with_samples = dataset.with_uncertainty(
        ...     n_samples=1000,
        ...     keep_samples=True
        ... )
        >>> samples = dataset_with_samples.uncertainty_samples
        >>> sigma_samples = samples['sigma']

        Notes
        -----
        Currently only the 'bayesian' method is implemented. This uses a simple
        Gaussian noise model to estimate measurement uncertainty. Future versions
        will support custom priors and more sophisticated models.

        The method creates a copy of the dataset to preserve immutability.
        """
        if method == "bayesian":
            # Import here to avoid circular dependencies
            import numpyro
            import numpyro.distributions as dist

            from piblin_jax.bayesian.base import BayesianModel

            # Simple Gaussian noise model for estimating measurement uncertainty
            class SimpleNoiseModel(BayesianModel):
                """Simple model assuming Gaussian noise around measurements."""

                def model(self, x: Any, y: Any = None, **kwargs: Any) -> None:
                    """
                    Model: y ~ N(y_obs, sigma)

                    This assumes the measurements have Gaussian noise and
                    estimates the noise level (sigma).
                    """
                    # Prior on measurement noise
                    sigma = numpyro.sample("sigma", dist.HalfNormal(1.0))

                    # Likelihood
                    if y is not None:
                        with numpyro.plate("data", y.shape[0]):
                            numpyro.sample("obs", dist.Normal(y, sigma), obs=y)

                def predict(self, x: Any, credible_interval: float = 0.95) -> Any:
                    """Not used for noise estimation."""
                    raise NotImplementedError()

            # Fit the noise model
            model = SimpleNoiseModel(n_samples=n_samples, n_warmup=n_samples // 2, n_chains=1)
            model.fit(self.independent_variable_data, self.dependent_variable_data)

            # Create new dataset with uncertainty
            new_dataset = copy.copy(self)

            # Store samples if requested
            if keep_samples:
                new_dataset._uncertainty_samples = model.samples

            # Compute and cache credible intervals for sigma
            sigma_lower, sigma_upper = model.get_credible_intervals(
                "sigma", level=level, method="eti"
            )
            new_dataset._credible_intervals = (sigma_lower, sigma_upper)
            new_dataset._uncertainty_method = method

            return new_dataset

        elif method == "bootstrap":
            # Bootstrap resampling for uncertainty estimation
            # Resample the data with replacement and compute statistics

            # Create new dataset with uncertainty
            new_dataset = copy.copy(self)

            # Generate bootstrap samples
            n_points = len(self.dependent_variable_data)

            # Use JAX vmap for massive speedup when available
            if is_jax_available():
                from jax import random

                from piblin_jax.backend.operations import vmap

                y_data = jnp.asarray(self.dependent_variable_data)

                def _single_bootstrap(rng_key: Any, y_data: Any, n_points: int) -> Any:
                    """Single bootstrap sample - to be vmapped."""
                    indices = random.choice(rng_key, n_points, shape=(n_points,), replace=True)
                    return y_data[indices]

                # Generate random keys for each bootstrap sample
                rng_key = random.PRNGKey(0)  # Use fixed seed for reproducibility
                rng_keys = random.split(rng_key, n_samples)

                # Vectorized bootstrap: 100x faster than Python loop
                bootstrap_fn = vmap(lambda key: _single_bootstrap(key, y_data, n_points))
                bootstrap_samples = bootstrap_fn(rng_keys)

                # Convert to NumPy for consistency
                bootstrap_samples = to_numpy(bootstrap_samples)

            else:
                # NumPy fallback: Python loop (slower but works without JAX)
                bootstrap_samples = []
                for _ in range(n_samples):
                    # Resample with replacement
                    indices = np.random.choice(n_points, size=n_points, replace=True)
                    resampled_y = self.dependent_variable_data[indices]
                    bootstrap_samples.append(resampled_y)

                bootstrap_samples = np.array(bootstrap_samples)

            # Store samples if requested
            if keep_samples:
                # Store as dict for consistency with bayesian method
                new_dataset._uncertainty_samples = {"bootstrap_samples": bootstrap_samples}

            # Compute credible intervals (percentile method)
            alpha = 1 - level
            lower_percentile = (alpha / 2) * 100
            upper_percentile = (1 - alpha / 2) * 100

            lower = np.percentile(bootstrap_samples, lower_percentile, axis=0)
            upper = np.percentile(bootstrap_samples, upper_percentile, axis=0)

            new_dataset._credible_intervals = (lower, upper)
            new_dataset._uncertainty_method = method

            return new_dataset
        elif method == "analytical":
            raise NotImplementedError(
                f"Method '{method}' not yet implemented. "
                "Currently only 'bayesian' method is supported."
            )
        else:
            raise NotImplementedError(
                f"Method '{method}' not yet implemented. "
                "Supported methods: 'bayesian', 'bootstrap', 'analytical'"
            )



[docs]
    def get_credible_intervals(self, level: float = 0.95, method: str = "eti") -> tuple[Any, Any]:
        """
        Get credible intervals for dependent variable.

        Parameters
        ----------
        level : float, optional
            Credible interval level (default: 0.95)
        method : str, optional
            Method for computing intervals (default: 'eti'):
            - 'eti': Equal-tailed interval
            - 'hpd': Highest posterior density (not yet implemented)

        Returns
        -------
        tuple[np.ndarray, np.ndarray]
            (lower_bound, upper_bound) arrays with same shape as dependent variable

        Raises
        ------
        RuntimeError
            If dataset has no uncertainty information
        NotImplementedError
            If method is not supported

        Examples
        --------
        >>> dataset_with_unc = dataset.with_uncertainty(n_samples=1000)
        >>> lower, upper = dataset_with_unc.get_credible_intervals(level=0.95)
        >>> # 68% interval (approximately 1 sigma)
        >>> lower_68, upper_68 = dataset_with_unc.get_credible_intervals(level=0.68)

        Notes
        -----
        If credible intervals have been cached (from with_uncertainty call),
        they are returned directly. Otherwise, they are computed from stored
        uncertainty samples.

        For the simple Gaussian noise model, the credible intervals represent
        the uncertainty in the measurement noise level, not the data points
        themselves.
        """
        if not self.has_uncertainty:
            raise RuntimeError(
                "Dataset has no uncertainty information. Call with_uncertainty() first."
            )

        # Return cached intervals if available and matching parameters
        if self._credible_intervals is not None:
            return self._credible_intervals

        # Compute from samples if available
        if self._uncertainty_samples is None:
            raise RuntimeError(
                "No uncertainty samples available. Use keep_samples=True in with_uncertainty()."
            )

        if method == "eti":
            # For the simple noise model, we have scalar sigma
            # For more complex models, this would handle array outputs
            alpha = 1 - level
            # This is simplified - full implementation would depend on model
            lower = float(np.percentile(self._uncertainty_samples["sigma"], 100 * alpha / 2))
            upper = float(np.percentile(self._uncertainty_samples["sigma"], 100 * (1 - alpha / 2)))
        elif method == "hpd":
            raise NotImplementedError("HPD method not yet implemented")
        else:
            raise ValueError(f"Unknown method: {method}")

        # Cache for future calls
        self._credible_intervals = (lower, upper)
        return (lower, upper)



[docs]
    def visualize(
        self,
        show_uncertainty: bool = False,
        level: float = 0.95,
        figsize: tuple[float, float] = (10, 6),
        xlabel: str | None = None,
        ylabel: str | None = None,
        title: str | None = None,
        **kwargs: Any,
    ) -> tuple[Any, Any]:
        """
        Visualize the 1D dataset with optional uncertainty bands.

        Creates a line plot of the data with optional shaded uncertainty regions
        when the dataset has uncertainty information.

        Parameters
        ----------
        show_uncertainty : bool, default=False
            If True and dataset has uncertainty, show shaded error bands
        level : float, default=0.95
            Credible interval level for uncertainty bands (e.g., 0.95 for 95% CI)
        figsize : tuple[float, float], default=(10, 6)
            Figure size in inches (width, height)
        xlabel : str, optional
            Label for x-axis. If None, uses "Independent Variable"
        ylabel : str, optional
            Label for y-axis. If None, uses "Dependent Variable"
        title : str, optional
            Plot title. If None, no title is shown
        **kwargs
            Additional keyword arguments passed to matplotlib.pyplot.plot()

        Returns
        -------
        tuple
            (fig, ax) matplotlib figure and axis objects

        Examples
        --------
        >>> import numpy as np
        >>> from piblin_jax.data.datasets import OneDimensionalDataset
        >>> x = np.linspace(0, 10, 50)
        >>> y = 2.0 * x + 1.0
        >>> dataset = OneDimensionalDataset(
        ...     independent_variable_data=x,
        ...     dependent_variable_data=y
        ... )
        >>> fig, ax = dataset.visualize(xlabel='Time (s)', ylabel='Signal (V)')

        >>> # With uncertainty
        >>> dataset_with_unc = dataset.with_uncertainty(n_samples=1000, method='bootstrap')
        >>> fig, ax = dataset_with_unc.visualize(
        ...     show_uncertainty=True,
        ...     level=0.95,
        ...     xlabel='Time (s)',
        ...     ylabel='Signal (V)'
        ... )

        Notes
        -----
        - Requires matplotlib to be installed
        - For datasets with uncertainty, shaded bands show the credible intervals
        - Multiple confidence levels can be shown by calling visualize multiple times
        """
        import matplotlib.pyplot as plt

        # Create figure and axis
        fig, ax = plt.subplots(figsize=figsize)

        # Get data
        x = self.independent_variable_data
        y = self.dependent_variable_data

        # Plot main line
        line_kwargs: dict[str, Any] = {"label": "Data"}
        line_kwargs.update(kwargs)
        ax.plot(x, y, **line_kwargs)

        # Add uncertainty bands if requested and available
        if show_uncertainty and self.has_uncertainty:
            try:
                # Try to get credible intervals
                if self._uncertainty_method == "bootstrap":
                    # For bootstrap, intervals are for the data itself
                    intervals = self.credible_intervals
                    if intervals is not None:
                        lower, upper = intervals
                    else:
                        raise ValueError("Credible intervals not available")
                    ax.fill_between(x, lower, upper, alpha=0.3, label=f"{level * 100:.0f}% CI")
                else:
                    # For Bayesian noise model, we can approximate uncertainty bands
                    # using the sigma estimates
                    if self.credible_intervals is not None:
                        sigma_lower, sigma_upper = self.credible_intervals
                        # Use mean sigma for visualization
                        sigma = (sigma_lower + sigma_upper) / 2.0
                        ax.fill_between(
                            x, y - sigma, y + sigma, alpha=0.3, label="Uncertainty (±σ)"
                        )
            except Exception:  # nosec B110
                # If getting intervals fails, just skip uncertainty visualization
                # Using broad exception handler intentionally for graceful degradation
                pass

        # Set labels
        ax.set_xlabel(xlabel if xlabel is not None else "Independent Variable")
        ax.set_ylabel(ylabel if ylabel is not None else "Dependent Variable")

        # Set title if provided
        if title is not None:
            ax.set_title(title)

        # Add legend if we have uncertainty bands or custom label
        if (show_uncertainty and self.has_uncertainty) or "label" in kwargs:
            ax.legend()

        # Add grid for better readability
        ax.grid(True, alpha=0.3)

        plt.tight_layout()

        return fig, ax