# Source code for piblin_jax.data.datasets.composite

"""
Composite one-dimensional dataset with multiple dependent variables.

Used for multi-channel instrument data where multiple signals share the
same independent variable (e.g., time, wavelength).
"""

from typing import Any

import numpy as np

from piblin_jax.backend import jnp, to_numpy

from .base import Dataset



class OneDimensionalCompositeDataset(Dataset):
    """
    Composite 1D dataset with shared independent variable and multiple dependents.

    This dataset type represents multi-channel or multi-detector data where
    multiple signals share the same independent variable:

    - Multi-detector chromatography (UV, fluorescence, conductivity)
    - Multi-channel spectroscopy
    - Multi-sensor time series
    - Parallel measurements with shared axis

    Parameters
    ----------
    independent_variable_data : array_like
        1D array of independent variable (time, wavelength, etc.) shared by
        all channels.
    dependent_variable_data_list : list of array_like
        List of 1D arrays, each representing a different channel/detector.
        All must have the same length as independent_variable_data.
    conditions : dict[str, Any] | None, optional
        Experimental conditions.
    details : dict[str, Any] | None, optional
        Additional metadata.

    Attributes
    ----------
    independent_variable_data : np.ndarray
        Shared independent variable as NumPy array.
    dependent_variable_data_list : list of np.ndarray
        List of dependent variables as NumPy arrays.
    conditions : dict[str, Any]
        Experimental conditions.
    details : dict[str, Any]
        Additional metadata.

    Raises
    ------
    ValueError
        If dependent_variable_data_list is empty, if any input is a scalar
        (zero-dimensional) value, or if any channel has different length than
        independent_variable_data.

    Examples
    --------
    >>> import numpy as np
    >>> from piblin_jax.data.datasets import OneDimensionalCompositeDataset
    >>> # Multi-detector HPLC data
    >>> time = np.linspace(0, 20, 2000)  # minutes
    >>> uv_254 = np.sin(time) + 0.1 * np.random.randn(2000)
    >>> uv_280 = np.cos(time) + 0.1 * np.random.randn(2000)
    >>> fluorescence = np.sin(2 * time) + 0.05 * np.random.randn(2000)
    >>> hplc = OneDimensionalCompositeDataset(
    ...     independent_variable_data=time,
    ...     dependent_variable_data_list=[uv_254, uv_280, fluorescence],
    ...     conditions={"mobile_phase": "ACN/H2O 60:40", "flow_rate": 1.0},
    ...     details={
    ...         "channels": ["UV 254nm", "UV 280nm", "Fluorescence"],
    ...         "instrument": "HPLC-1"
    ...     }
    ... )
    >>> hplc.independent_variable_data.shape
    (2000,)
    >>> len(hplc.dependent_variable_data_list)
    3
    >>> hplc.dependent_variable_data_list[0].shape
    (2000,)

    >>> # Multi-channel oscilloscope data
    >>> t = np.linspace(0, 1, 10000)
    >>> ch1 = np.sin(2 * np.pi * 5 * t)
    >>> ch2 = np.sin(2 * np.pi * 10 * t)
    >>> ch3 = np.sin(2 * np.pi * 15 * t)
    >>> ch4 = np.sin(2 * np.pi * 20 * t)
    >>> scope_data = OneDimensionalCompositeDataset(
    ...     independent_variable_data=t,
    ...     dependent_variable_data_list=[ch1, ch2, ch3, ch4],
    ...     conditions={"sampling_rate": 10000},
    ...     details={"instrument": "oscilloscope", "channels": 4}
    ... )

    Notes
    -----
    This dataset type is useful when multiple measurements are made simultaneously
    along the same independent axis. Each channel is kept as a separate array
    in the list, allowing different processing or analysis on each channel
    while maintaining their shared relationship through the common independent
    variable.

    The internal storage uses backend arrays (JAX when available) and converts
    to NumPy at the property boundaries.
    """

    def __init__(
        self,
        independent_variable_data: Any,
        dependent_variable_data_list: list[Any],
        conditions: dict[str, Any] | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        """
        Initialize composite one-dimensional dataset.

        Parameters
        ----------
        independent_variable_data : array_like
            1D array of shared independent variable.
        dependent_variable_data_list : list of array_like
            List of 1D arrays for each channel.
        conditions : dict[str, Any] | None, optional
            Experimental conditions.
        details : dict[str, Any] | None, optional
            Additional metadata.

        Raises
        ------
        ValueError
            If list is empty, if any input is a scalar (zero-dimensional)
            value, or if any channel length doesn't match independent
            variable.
        """
        super().__init__(conditions=conditions, details=details)

        # ``None`` is treated like an empty list so it produces the documented
        # ValueError. Any other iterable is materialised once so emptiness can
        # be tested with plain truthiness; applying ``not`` directly to an
        # array-like container would raise an unrelated "ambiguous truth
        # value" error instead.
        channels = (
            []
            if dependent_variable_data_list is None
            else list(dependent_variable_data_list)
        )
        if not channels:
            raise ValueError(
                "OneDimensionalCompositeDataset requires at least one dependent variable. "
                "Got empty list."
            )

        # Convert independent variable to backend array
        independent = jnp.asarray(independent_variable_data)
        # A scalar has an empty shape, so ``shape[0]`` below would raise an
        # undocumented IndexError; report it as a ValueError instead.
        if independent.ndim == 0:
            raise ValueError(
                "independent_variable_data must be an array with at least one "
                "dimension, got a scalar."
            )
        expected_length = independent.shape[0]

        # Convert all dependent variables to backend arrays and validate
        converted = []
        for i, dep_data in enumerate(channels):
            dep_array = jnp.asarray(dep_data)

            if dep_array.ndim == 0:
                raise ValueError(
                    f"Dependent variable at index {i} must be an array with at "
                    f"least one dimension, got a scalar."
                )

            # Validation: each channel must match independent variable length
            if dep_array.shape[0] != expected_length:
                raise ValueError(
                    f"All dependent variables must have same length as independent variable. "
                    f"Independent variable has length {expected_length}, but "
                    f"dependent variable at index {i} has length {dep_array.shape[0]}"
                )

            converted.append(dep_array)

        # Private state is assigned only after every input has been validated.
        self._independent_variable_data = independent
        self._dependent_variable_data_list = converted

    @property
    def independent_variable_data(self) -> np.ndarray:
        """
        Get shared independent variable as NumPy array.

        Returns
        -------
        np.ndarray
            1D NumPy array of independent variable shared by all channels.

        Examples
        --------
        >>> dataset.independent_variable_data
        array([0., 0.01, 0.02, ..., 19.98, 19.99, 20.])
        """
        return to_numpy(self._independent_variable_data)

    @property
    def dependent_variable_data_list(self) -> list[np.ndarray]:
        """
        Get list of dependent variables as NumPy arrays.

        A new list is built on every access, so adding or removing items on
        the returned list does not change the channels held by the dataset.

        Returns
        -------
        list of np.ndarray
            List of 1D NumPy arrays, one for each channel/detector.

        Examples
        --------
        >>> len(dataset.dependent_variable_data_list)
        3
        >>> dataset.dependent_variable_data_list[0]  # First channel
        array([0.123, 0.145, ..., 0.234])
        >>> dataset.dependent_variable_data_list[1]  # Second channel
        array([0.456, 0.478, ..., 0.567])
        >>> # Process each channel
        >>> for i, channel in enumerate(dataset.dependent_variable_data_list):
        ...     print(f"Channel {i}: max = {channel.max():.3f}")
        Channel 0: max = 1.234
        Channel 1: max = 1.567
        Channel 2: max = 0.987
        """
        # Convert all backend arrays to NumPy
        return [to_numpy(dep) for dep in self._dependent_variable_data_list]