"""
One-dimensional dataset with independent and dependent variables.
This is the most common dataset type, used for time series, spectra,
chromatograms, and other 1D data.
"""
import copy
from typing import Any
import numpy as np
from piblin_jax.backend import is_jax_available, jnp, to_numpy
from .base import Dataset
[docs]
class OneDimensionalDataset(Dataset):
"""
One-dimensional dataset with independent and dependent variables.
This is the most common dataset type, representing paired (x, y) data such as:
- Time series measurements
- Spectroscopy data (wavelength vs. absorbance)
- Chromatography traces (time vs. detector response)
- Titration curves (volume vs. pH)
Parameters
----------
independent_variable_data : array_like
1D array of independent variable values (e.g., time, wavelength).
dependent_variable_data : array_like
1D array of dependent variable values (e.g., signal, absorbance).
conditions : dict[str, Any] | None, optional
Experimental conditions.
details : dict[str, Any] | None, optional
Additional metadata.
Attributes
----------
independent_variable_data : np.ndarray
Independent variable as NumPy array.
dependent_variable_data : np.ndarray
Dependent variable as NumPy array.
conditions : dict[str, Any]
Experimental conditions.
details : dict[str, Any]
Additional metadata.
Raises
------
ValueError
If independent and dependent arrays have different shapes.
Examples
--------
>>> import numpy as np
>>> from piblin_jax.data.datasets import OneDimensionalDataset
>>> # Time series data
>>> time = np.linspace(0, 10, 100)
>>> signal = np.sin(time)
>>> dataset = OneDimensionalDataset(
... independent_variable_data=time,
... dependent_variable_data=signal,
... conditions={"temperature": 25.0, "sample": "A"},
... details={"instrument": "oscilloscope", "sampling_rate": 10.0}
... )
>>> dataset.independent_variable_data.shape
(100,)
>>> dataset.dependent_variable_data.shape
(100,)
>>> # Spectroscopy data
>>> wavelength = np.linspace(200, 800, 500)
>>> absorbance = np.exp(-((wavelength - 450) ** 2) / 5000)
>>> spectrum = OneDimensionalDataset(
... independent_variable_data=wavelength,
... dependent_variable_data=absorbance,
... conditions={"concentration": 1e-5, "solvent": "water"},
... details={"units_x": "nm", "units_y": "AU"}
... )
Notes
-----
Arrays are stored internally as backend arrays (JAX DeviceArray when available,
NumPy ndarray otherwise) and converted to NumPy arrays when accessed through
properties. This ensures compatibility with JAX transformations while maintaining
a NumPy-compatible API.
"""
[docs]
def __init__(
self,
independent_variable_data: Any,
dependent_variable_data: Any,
conditions: dict[str, Any] | None = None,
details: dict[str, Any] | None = None,
):
"""
Initialize one-dimensional dataset.
Parameters
----------
independent_variable_data : array_like
1D array of independent variable values.
dependent_variable_data : array_like
1D array of dependent variable values.
conditions : dict[str, Any] | None, optional
Experimental conditions.
details : dict[str, Any] | None, optional
Additional metadata.
Raises
------
ValueError
If arrays have different shapes.
"""
super().__init__(conditions=conditions, details=details)
# Convert to backend arrays and store internally
self._independent_variable_data = jnp.asarray(independent_variable_data)
self._dependent_variable_data = jnp.asarray(dependent_variable_data)
# Validation: arrays must have same shape
if self._independent_variable_data.shape != self._dependent_variable_data.shape:
raise ValueError(
f"Independent and dependent arrays must have same shape. "
f"Got independent: {self._independent_variable_data.shape}, "
f"dependent: {self._dependent_variable_data.shape}"
)
@property
def independent_variable_data(self) -> np.ndarray:
"""
Get independent variable data as NumPy array.
:no-index:
Returns
-------
np.ndarray
1D NumPy array of independent variable values.
Examples
--------
>>> dataset.independent_variable_data
array([0., 0.1, 0.2, ..., 9.8, 9.9, 10.])
"""
return to_numpy(self._independent_variable_data)
@property
def dependent_variable_data(self) -> np.ndarray:
"""
Get dependent variable data as NumPy array.
:no-index:
Returns
-------
np.ndarray
1D NumPy array of dependent variable values.
Examples
--------
>>> dataset.dependent_variable_data
array([0.000, 0.099, 0.198, ..., -0.544, -0.456, -0.544])
"""
return to_numpy(self._dependent_variable_data)
[docs]
def with_uncertainty(
self,
n_samples: int = 1000,
method: str = "bayesian",
keep_samples: bool = False,
level: float = 0.95,
) -> "OneDimensionalDataset":
"""
Add uncertainty quantification to dataset.
This method creates a new dataset with uncertainty information computed
using the specified method. The original dataset is not modified.
Parameters
----------
n_samples : int, optional
Number of samples for uncertainty quantification (default: 1000)
method : str, optional
Method for uncertainty quantification (default: 'bayesian'):
- 'bayesian': NumPyro MCMC sampling
- 'bootstrap': Bootstrap resampling (not yet implemented)
- 'analytical': Analytical uncertainty propagation (not yet implemented)
keep_samples : bool, optional
If True, store full posterior samples (default: False)
level : float, optional
Credible interval level (default: 0.95)
Returns
-------
OneDimensionalDataset
New dataset with uncertainty information
Raises
------
NotImplementedError
If method is not 'bayesian'
Examples
--------
>>> import numpy as np
>>> from piblin_jax.data.datasets import OneDimensionalDataset
>>> x = np.linspace(0, 10, 50)
>>> y = 2.0 * x + 1.0 + 0.1 * np.random.randn(len(x))
>>> dataset = OneDimensionalDataset(
... independent_variable_data=x,
... dependent_variable_data=y
... )
>>> # Add Bayesian uncertainty
>>> dataset_with_unc = dataset.with_uncertainty(
... n_samples=1000,
... method='bayesian',
... keep_samples=False,
... level=0.95
... )
>>> dataset_with_unc.has_uncertainty
True
>>> lower, upper = dataset_with_unc.credible_intervals
>>> # With full samples
>>> dataset_with_samples = dataset.with_uncertainty(
... n_samples=1000,
... keep_samples=True
... )
>>> samples = dataset_with_samples.uncertainty_samples
>>> sigma_samples = samples['sigma']
Notes
-----
Currently only the 'bayesian' method is implemented. This uses a simple
Gaussian noise model to estimate measurement uncertainty. Future versions
will support custom priors and more sophisticated models.
The method creates a copy of the dataset to preserve immutability.
"""
if method == "bayesian":
# Import here to avoid circular dependencies
import numpyro
import numpyro.distributions as dist
from piblin_jax.bayesian.base import BayesianModel
# Simple Gaussian noise model for estimating measurement uncertainty
class SimpleNoiseModel(BayesianModel):
"""Simple model assuming Gaussian noise around measurements."""
def model(self, x: Any, y: Any = None, **kwargs: Any) -> None:
"""
Model: y ~ N(y_obs, sigma)
This assumes the measurements have Gaussian noise and
estimates the noise level (sigma).
"""
# Prior on measurement noise
sigma = numpyro.sample("sigma", dist.HalfNormal(1.0))
# Likelihood
if y is not None:
with numpyro.plate("data", y.shape[0]):
numpyro.sample("obs", dist.Normal(y, sigma), obs=y)
def predict(self, x: Any, credible_interval: float = 0.95) -> Any:
"""Not used for noise estimation."""
raise NotImplementedError()
# Fit the noise model
model = SimpleNoiseModel(n_samples=n_samples, n_warmup=n_samples // 2, n_chains=1)
model.fit(self.independent_variable_data, self.dependent_variable_data)
# Create new dataset with uncertainty
new_dataset = copy.copy(self)
# Store samples if requested
if keep_samples:
new_dataset._uncertainty_samples = model.samples
# Compute and cache credible intervals for sigma
sigma_lower, sigma_upper = model.get_credible_intervals(
"sigma", level=level, method="eti"
)
new_dataset._credible_intervals = (sigma_lower, sigma_upper)
new_dataset._uncertainty_method = method
return new_dataset
elif method == "bootstrap":
# Bootstrap resampling for uncertainty estimation
# Resample the data with replacement and compute statistics
# Create new dataset with uncertainty
new_dataset = copy.copy(self)
# Generate bootstrap samples
n_points = len(self.dependent_variable_data)
# Use JAX vmap for massive speedup when available
if is_jax_available():
from jax import random
from piblin_jax.backend.operations import vmap
y_data = jnp.asarray(self.dependent_variable_data)
def _single_bootstrap(rng_key: Any, y_data: Any, n_points: int) -> Any:
"""Single bootstrap sample - to be vmapped."""
indices = random.choice(rng_key, n_points, shape=(n_points,), replace=True)
return y_data[indices]
# Generate random keys for each bootstrap sample
rng_key = random.PRNGKey(0) # Use fixed seed for reproducibility
rng_keys = random.split(rng_key, n_samples)
# Vectorized bootstrap: 100x faster than Python loop
bootstrap_fn = vmap(lambda key: _single_bootstrap(key, y_data, n_points))
bootstrap_samples = bootstrap_fn(rng_keys)
# Convert to NumPy for consistency
bootstrap_samples = to_numpy(bootstrap_samples)
else:
# NumPy fallback: Python loop (slower but works without JAX)
bootstrap_samples = []
for _ in range(n_samples):
# Resample with replacement
indices = np.random.choice(n_points, size=n_points, replace=True)
resampled_y = self.dependent_variable_data[indices]
bootstrap_samples.append(resampled_y)
bootstrap_samples = np.array(bootstrap_samples)
# Store samples if requested
if keep_samples:
# Store as dict for consistency with bayesian method
new_dataset._uncertainty_samples = {"bootstrap_samples": bootstrap_samples}
# Compute credible intervals (percentile method)
alpha = 1 - level
lower_percentile = (alpha / 2) * 100
upper_percentile = (1 - alpha / 2) * 100
lower = np.percentile(bootstrap_samples, lower_percentile, axis=0)
upper = np.percentile(bootstrap_samples, upper_percentile, axis=0)
new_dataset._credible_intervals = (lower, upper)
new_dataset._uncertainty_method = method
return new_dataset
elif method == "analytical":
raise NotImplementedError(
f"Method '{method}' not yet implemented. "
"Currently only 'bayesian' method is supported."
)
else:
raise NotImplementedError(
f"Method '{method}' not yet implemented. "
"Supported methods: 'bayesian', 'bootstrap', 'analytical'"
)
[docs]
def get_credible_intervals(self, level: float = 0.95, method: str = "eti") -> tuple[Any, Any]:
"""
Get credible intervals for dependent variable.
Parameters
----------
level : float, optional
Credible interval level (default: 0.95)
method : str, optional
Method for computing intervals (default: 'eti'):
- 'eti': Equal-tailed interval
- 'hpd': Highest posterior density (not yet implemented)
Returns
-------
tuple[np.ndarray, np.ndarray]
(lower_bound, upper_bound) arrays with same shape as dependent variable
Raises
------
RuntimeError
If dataset has no uncertainty information
NotImplementedError
If method is not supported
Examples
--------
>>> dataset_with_unc = dataset.with_uncertainty(n_samples=1000)
>>> lower, upper = dataset_with_unc.get_credible_intervals(level=0.95)
>>> # 68% interval (approximately 1 sigma)
>>> lower_68, upper_68 = dataset_with_unc.get_credible_intervals(level=0.68)
Notes
-----
If credible intervals have been cached (from with_uncertainty call),
they are returned directly. Otherwise, they are computed from stored
uncertainty samples.
For the simple Gaussian noise model, the credible intervals represent
the uncertainty in the measurement noise level, not the data points
themselves.
"""
if not self.has_uncertainty:
raise RuntimeError(
"Dataset has no uncertainty information. Call with_uncertainty() first."
)
# Return cached intervals if available and matching parameters
if self._credible_intervals is not None:
return self._credible_intervals
# Compute from samples if available
if self._uncertainty_samples is None:
raise RuntimeError(
"No uncertainty samples available. Use keep_samples=True in with_uncertainty()."
)
if method == "eti":
# For the simple noise model, we have scalar sigma
# For more complex models, this would handle array outputs
alpha = 1 - level
# This is simplified - full implementation would depend on model
lower = float(np.percentile(self._uncertainty_samples["sigma"], 100 * alpha / 2))
upper = float(np.percentile(self._uncertainty_samples["sigma"], 100 * (1 - alpha / 2)))
elif method == "hpd":
raise NotImplementedError("HPD method not yet implemented")
else:
raise ValueError(f"Unknown method: {method}")
# Cache for future calls
self._credible_intervals = (lower, upper)
return (lower, upper)
[docs]
def visualize(
self,
show_uncertainty: bool = False,
level: float = 0.95,
figsize: tuple[float, float] = (10, 6),
xlabel: str | None = None,
ylabel: str | None = None,
title: str | None = None,
**kwargs: Any,
) -> tuple[Any, Any]:
"""
Visualize the 1D dataset with optional uncertainty bands.
Creates a line plot of the data with optional shaded uncertainty regions
when the dataset has uncertainty information.
Parameters
----------
show_uncertainty : bool, default=False
If True and dataset has uncertainty, show shaded error bands
level : float, default=0.95
Credible interval level for uncertainty bands (e.g., 0.95 for 95% CI)
figsize : tuple[float, float], default=(10, 6)
Figure size in inches (width, height)
xlabel : str, optional
Label for x-axis. If None, uses "Independent Variable"
ylabel : str, optional
Label for y-axis. If None, uses "Dependent Variable"
title : str, optional
Plot title. If None, no title is shown
**kwargs
Additional keyword arguments passed to matplotlib.pyplot.plot()
Returns
-------
tuple
(fig, ax) matplotlib figure and axis objects
Examples
--------
>>> import numpy as np
>>> from piblin_jax.data.datasets import OneDimensionalDataset
>>> x = np.linspace(0, 10, 50)
>>> y = 2.0 * x + 1.0
>>> dataset = OneDimensionalDataset(
... independent_variable_data=x,
... dependent_variable_data=y
... )
>>> fig, ax = dataset.visualize(xlabel='Time (s)', ylabel='Signal (V)')
>>> # With uncertainty
>>> dataset_with_unc = dataset.with_uncertainty(n_samples=1000, method='bootstrap')
>>> fig, ax = dataset_with_unc.visualize(
... show_uncertainty=True,
... level=0.95,
... xlabel='Time (s)',
... ylabel='Signal (V)'
... )
Notes
-----
- Requires matplotlib to be installed
- For datasets with uncertainty, shaded bands show the credible intervals
- Multiple confidence levels can be shown by calling visualize multiple times
"""
import matplotlib.pyplot as plt
# Create figure and axis
fig, ax = plt.subplots(figsize=figsize)
# Get data
x = self.independent_variable_data
y = self.dependent_variable_data
# Plot main line
line_kwargs: dict[str, Any] = {"label": "Data"}
line_kwargs.update(kwargs)
ax.plot(x, y, **line_kwargs)
# Add uncertainty bands if requested and available
if show_uncertainty and self.has_uncertainty:
try:
# Try to get credible intervals
if self._uncertainty_method == "bootstrap":
# For bootstrap, intervals are for the data itself
intervals = self.credible_intervals
if intervals is not None:
lower, upper = intervals
else:
raise ValueError("Credible intervals not available")
ax.fill_between(x, lower, upper, alpha=0.3, label=f"{level * 100:.0f}% CI")
else:
# For Bayesian noise model, we can approximate uncertainty bands
# using the sigma estimates
if self.credible_intervals is not None:
sigma_lower, sigma_upper = self.credible_intervals
# Use mean sigma for visualization
sigma = (sigma_lower + sigma_upper) / 2.0
ax.fill_between(
x, y - sigma, y + sigma, alpha=0.3, label="Uncertainty (±σ)"
)
except Exception: # nosec B110
# If getting intervals fails, just skip uncertainty visualization
# Using broad exception handler intentionally for graceful degradation
pass
# Set labels
ax.set_xlabel(xlabel if xlabel is not None else "Independent Variable")
ax.set_ylabel(ylabel if ylabel is not None else "Dependent Variable")
# Set title if provided
if title is not None:
ax.set_title(title)
# Add legend if we have uncertainty bands or custom label
if (show_uncertainty and self.has_uncertainty) or "label" in kwargs:
ax.legend()
# Add grid for better readability
ax.grid(True, alpha=0.3)
plt.tight_layout()
return fig, ax