Source code for piblin_jax.data.metadata
"""Metadata utilities for managing, validating, extracting, and merging metadata.
This module provides utilities for working with metadata (conditions and details)
across the data hierarchy. Metadata is separated into:
- **Conditions**: Experimental parameters that define comparability between datasets
(e.g., temperature, pressure, concentration)
- **Details**: Contextual information that doesn't affect experimental conditions
(e.g., operator, date, notes)
The module supports:
- Merging metadata from multiple sources with configurable conflict resolution
- Separating conditions from details using explicit keys or heuristics
- Validating metadata against schemas with type checking
- Extracting metadata from filenames, paths, and file headers
"""
import re
import types
from collections.abc import Callable
from pathlib import Path
from typing import Any
[docs]
def merge_metadata(
    metadata_list: list[dict[str, Any]], strategy: str = "override"
) -> dict[str, Any]:
    """Merge multiple metadata dictionaries.

    Combines metadata from multiple sources with configurable conflict
    resolution. Dictionaries are processed in order, so later dictionaries
    have higher priority for the 'override' strategy.

    Parameters
    ----------
    metadata_list : list[dict[str, Any]]
        List of metadata dictionaries to merge (in priority order).
        Earlier dictionaries have lower priority for conflict resolution.
    strategy : str, optional
        Conflict resolution strategy (default: "override"):

        - 'override': Later values override earlier ones
        - 'keep_first': Keep first value encountered
        - 'raise': Raise ValueError when a key maps to two unequal values
        - 'list': Collect conflicting values in a list (duplicates removed).
          If the value already stored for a key is a list, it is treated as
          the collection of values seen so far and new values are added to it.

    Returns
    -------
    dict[str, Any]
        Merged metadata dictionary. Values are not deep-copied, but the input
        dictionaries and their list values are never modified.

    Raises
    ------
    ValueError
        If strategy is unknown, or if strategy is 'raise' and conflicting
        values are detected.

    Examples
    --------
    >>> meta1 = {"temp": 20, "sample": "A1"}
    >>> meta2 = {"temp": 25, "pressure": 1.0}
    >>> merge_metadata([meta1, meta2])
    {'temp': 25, 'sample': 'A1', 'pressure': 1.0}
    >>> merge_metadata([meta1, meta2], strategy="keep_first")
    {'temp': 20, 'sample': 'A1', 'pressure': 1.0}
    >>> merge_metadata([meta1, meta2], strategy="list")
    {'temp': [20, 25], 'sample': 'A1', 'pressure': 1.0}
    """
    # Reject bad strategies immediately instead of only when a conflict
    # happens to occur, so a typo cannot go unnoticed on conflict-free data.
    if strategy not in ("override", "keep_first", "raise", "list"):
        raise ValueError(f"Unknown strategy: {strategy}")

    if not metadata_list:
        return {}

    result: dict[str, Any] = {}
    # Keys whose list value was created by this function. Only those lists
    # may be appended to in place; any other list belongs to the caller.
    owned_lists: set[str] = set()

    for metadata in metadata_list:
        for key, value in metadata.items():
            if key not in result:
                result[key] = value
                continue

            existing = result[key]
            if strategy == "override":
                result[key] = value
            elif strategy == "keep_first":
                continue
            elif strategy == "raise":
                if existing != value:
                    raise ValueError(
                        f"Metadata conflict for key '{key}': {existing} vs {value}"
                    )
            elif isinstance(existing, list):
                # strategy == "list", value collection already under way
                if value not in existing:
                    if key not in owned_lists:
                        # The list came straight from an input dictionary:
                        # copy it so the caller's data is left untouched.
                        existing = list(existing)
                        result[key] = existing
                        owned_lists.add(key)
                    existing.append(value)
            elif existing != value:
                # strategy == "list", first conflict for this key
                result[key] = [existing, value]
                owned_lists.add(key)

    return result
[docs]
def separate_conditions_details(
metadata: dict[str, Any], condition_keys: list[str] | None = None
) -> tuple[dict[str, Any], dict[str, Any]]:
"""Separate metadata into conditions and details.
Conditions are experimental parameters that define comparability between
datasets (e.g., temperature, pressure). Details are contextual information
(e.g., operator, date, notes).
Parameters
----------
metadata : dict[str, Any]
Combined metadata dictionary
condition_keys : list[str] | None, optional
Known condition keys (experimental parameters).
If None, heuristics are used to identify conditions based on
common experimental parameter names.
Returns
-------
conditions : dict[str, Any]
Experimental conditions (parameters defining comparability)
details : dict[str, Any]
Context information (non-experimental metadata)
Examples
--------
>>> metadata = {"temp": 25, "pressure": 1.0, "operator": "John"}
>>> conditions, details = separate_conditions_details(
... metadata,
... condition_keys=["temp", "pressure"]
... )
>>> conditions
{'temp': 25, 'pressure': 1.0}
>>> details
{'operator': 'John'}
Using heuristics:
>>> metadata = {"temperature": 25, "strain": 0.1, "notes": "Trial 1"}
>>> conditions, details = separate_conditions_details(metadata)
>>> "temperature" in conditions
True
>>> "notes" in details
True
"""
if condition_keys is None:
# Use heuristics to identify conditions
# Conditions typically: temperature, pressure, concentration, etc.
condition_key_patterns = [
"temp",
"temperature",
"pressure",
"concentration",
"frequency",
"strain",
"stress",
"time",
"wavelength",
"ph",
"humidity",
"voltage",
"current",
"power",
]
condition_keys = [
key
for key in metadata
if any(pattern in key.lower() for pattern in condition_key_patterns)
]
conditions = {k: v for k, v in metadata.items() if k in condition_keys}
details = {k: v for k, v in metadata.items() if k not in condition_keys}
return conditions, details
[docs]
def validate_metadata(
metadata: dict[str, Any],
schema: dict[str, type | Callable[[Any], bool]] | None = None,
required_keys: list[str] | None = None,
) -> bool:
"""Validate metadata against a schema.
Performs type checking and required key validation. Validation is optional
and can be configured with schema and required_keys parameters.
Parameters
----------
metadata : dict[str, Any]
Metadata to validate
schema : dict[str, type | Callable[[Any], bool]] | None, optional
Schema defining expected types or validation functions.
Keys are metadata field names, values are either:
- Type objects (e.g., float, str, int) for type checking
- Callable validators that return True if valid
Example: ``{'temperature': float, 'sample_id': str}``
required_keys : list[str] | None, optional
Keys that must be present in metadata
Returns
-------
bool
True if valid
Raises
------
ValueError
If validation fails (missing required keys, type mismatch,
or custom validation function returns False)
Examples
--------
Type checking:
>>> metadata = {"temp": 25.0, "sample": "A1"}
>>> schema = {"temp": float, "sample": str}
>>> validate_metadata(metadata, schema=schema)
True
Required keys:
>>> validate_metadata(metadata, required_keys=["temp", "sample"])
True
Custom validation:
>>> schema = {"ph": lambda x: 0 <= x <= 14}
>>> validate_metadata({"ph": 7.0}, schema=schema)
True
"""
# Check required keys
if required_keys:
missing = set(required_keys) - set(metadata.keys())
if missing:
raise ValueError(f"Missing required metadata keys: {missing}")
# Type checking
if schema:
for key, expected_type in schema.items():
if key in metadata:
value = metadata[key]
if isinstance(expected_type, type):
if not isinstance(value, expected_type):
raise ValueError(
f"Metadata '{key}' has incorrect type: "
f"expected {expected_type.__name__}, "
f"got {type(value).__name__}"
)
elif callable(expected_type):
# Custom validation function
if not expected_type(value):
raise ValueError(f"Metadata '{key}' failed validation")
return True
[docs]
def parse_key_value_string(text: str, separator: str = "=", delimiter: str = ",") -> dict[str, str]:
    """Parse key-value pairs from a string.

    Splits ``text`` into pairs on ``delimiter`` and each pair into a key and
    a value on the first ``separator``. Useful for metadata embedded in
    filenames, headers, or configuration strings.

    Parameters
    ----------
    text : str
        String containing key-value pairs.
        Example: ``"temp=25,pressure=1.0,sample=A1"``
    separator : str, optional
        Character separating keys from values (default: "=")
    delimiter : str, optional
        Character separating pairs (default: ",")

    Returns
    -------
    dict[str, str]
        Parsed metadata (all values are strings, convert as needed).
        Keys and values are stripped of surrounding whitespace; pieces that
        do not contain the separator are skipped.

    Examples
    --------
    >>> parse_key_value_string("temp=25,pressure=1.0")
    {'temp': '25', 'pressure': '1.0'}
    >>> parse_key_value_string("temp:25;pressure:1.0", separator=":", delimiter=";")
    {'temp': '25', 'pressure': '1.0'}
    """
    parsed: dict[str, str] = {}
    for chunk in text.split(delimiter):
        # Only the first separator splits, so values may contain it.
        name, found, remainder = chunk.strip().partition(separator)
        if not found:
            continue
        parsed[name.strip()] = remainder.strip()
    return parsed
[docs]
def extract_from_filename(filename: str | Path, pattern: str | None = None) -> dict[str, str]:
"""Extract metadata from filename using regex pattern.
Parses filenames to extract metadata using either custom regex patterns
or common heuristics for scientific data files.
Parameters
----------
filename : str | Path
Filename or path (extension is removed before matching)
pattern : str | None, optional
Regex pattern with named groups for extraction.
If None, uses common heuristics for sample names, temperatures,
and replicate numbers.
Returns
-------
dict[str, str]
Extracted metadata (all values are strings)
Examples
--------
Using heuristics:
>>> extract_from_filename("sample_A1_temp_25C_001.csv")
{'sample': 'A1', 'temp': '25', 'replicate': '001'}
Using custom pattern:
>>> pattern = r"(?P<sample>\\w+)_(?P<temp>\\d+)C"
>>> extract_from_filename("sample_A1_25C.csv", pattern)
{'sample': 'A1', 'temp': '25'}
"""
if isinstance(filename, Path):
filename = filename.stem # Remove extension
else:
filename = Path(filename).stem
metadata = {}
if pattern:
match = re.search(pattern, filename)
if match:
metadata = match.groupdict()
else:
# Heuristic patterns for common scientific filename conventions
# Use non-greedy matching and word boundaries
patterns = [
r"sample[_-]?(?P<sample>[A-Za-z0-9]+?)(?:[_-]|$)",
r"temp[_-]?(?P<temp>\d+\.?\d*)",
r"(?P<replicate>\d{3,})$", # Trailing numbers (3+ digits)
]
for p in patterns:
match = re.search(p, filename, re.IGNORECASE)
if match:
metadata.update(match.groupdict())
return metadata
[docs]
def extract_from_path(filepath: str | Path, level_names: list[str] | None = None) -> dict[str, str]:
"""Extract metadata from directory structure.
Parses directory hierarchy to extract metadata based on directory names
at different levels.
Parameters
----------
filepath : str | Path
File path
level_names : list[str] | None, optional
Names for each directory level (from deepest to root).
Example: ``['sample', 'experiment', 'project']`` extracts
sample from parent directory, experiment from grandparent, etc.
If None, returns empty dict.
Returns
-------
dict[str, str]
Extracted metadata
Examples
--------
>>> extract_from_path(
... "/data/ProjectA/ExpB/SampleC/data.csv",
... ['sample', 'experiment', 'project']
... )
{'sample': 'SampleC', 'experiment': 'ExpB', 'project': 'ProjectA'}
"""
path = Path(filepath)
parts = path.parts[:-1] # Exclude filename
metadata = {}
if level_names and parts:
for i, name in enumerate(level_names):
if i < len(parts):
metadata[name] = parts[-(i + 1)]
return metadata
[docs]
def parse_header_metadata(
    header_lines: list[str], comment_char: str = "#", separator: str = ":"
) -> dict[str, str]:
    """Parse metadata from file header comment lines.

    Reads ``key<separator>value`` entries out of the comment lines of a file
    header, as commonly used in scientific data files to record experimental
    conditions and context.

    Parameters
    ----------
    header_lines : list[str]
        Lines from file header
    comment_char : str, optional
        Comment character (default: "#")
    separator : str, optional
        Character separating keys from values (default: ":")

    Returns
    -------
    dict[str, str]
        Parsed metadata (all values are strings). Keys and values are
        stripped of surrounding whitespace; only the first separator on a
        line splits it, so values may contain the separator.

    Examples
    --------
    >>> lines = [
    ...     "# Temperature: 25",
    ...     "# Pressure: 1.0",
    ...     "# Sample: A1"
    ... ]
    >>> parse_header_metadata(lines)
    {'Temperature': '25', 'Pressure': '1.0', 'Sample': 'A1'}

    With custom separators:

    >>> lines = ["// Temp = 25", "// Sample = A1"]
    >>> parse_header_metadata(lines, comment_char="//", separator="=")
    {'Temp': '25', 'Sample': 'A1'}
    """
    parsed: dict[str, str] = {}
    prefix_length = len(comment_char)
    for raw_line in header_lines:
        text = raw_line.strip()
        if not text.startswith(comment_char):
            continue
        # Drop the comment marker, then split on the first separator only.
        content = text[prefix_length:].strip()
        name, found, remainder = content.partition(separator)
        if found:
            parsed[name.strip()] = remainder.strip()
    return parsed