Source code for selfclean_audio.datasets.csem
# Copyright (c) Lucerne University of Applied Sciences and Arts.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from pathlib import Path
import pandas as pd
import torch
from selfclean_audio.datasets.base import BaseAudioDataset
[docs]
class CSEMMembranePumps(BaseAudioDataset):
    """CSEM Membrane Pump Audio Dataset loader.

    Expects the following structure under ``root`` (see
    ``data/CSEM/README.md``)::

        root/
            files/
                {guid}.wav
                ...
            index.csv   # columns: id, filename, label

    Each item is a 4-tuple ``(waveform, path, label, noisy_label)``:

    * ``waveform`` is whatever ``_load_and_preprocess_audio`` returns as its
      first value.
    * ``path`` is ``str(root / files_dir / <file>)``; it is absolute only if
      ``root`` is absolute.
    * ``label`` is the ``label`` column of the index cast to ``int``.
    * ``noisy_label`` is not known for this dataset and is always
      ``torch.tensor(0)`` here. Per the original note it is set by the
      synthetic/noise wrappers when applicable, otherwise considered 0 by
      downstream code.
    """

    def __init__(
        self,
        root: str | Path,
        convert_mono: bool = True,
        sample_rate: int = 16000,
        target_duration_sec: float | None = None,
        index_file: str | Path = "index.csv",
        files_dir: str | Path = "files",
    ):
        """Read and validate the index file; audio is loaded lazily per item.

        Args:
            root: Dataset root directory.
            convert_mono: Forwarded unchanged to ``BaseAudioDataset``.
            sample_rate: Forwarded unchanged to ``BaseAudioDataset``.
            target_duration_sec: Forwarded unchanged to ``BaseAudioDataset``.
            index_file: Index CSV path, relative to ``root``.
            files_dir: Audio directory path, relative to ``root``.

        Raises:
            FileNotFoundError: If the index file or the audio directory does
                not exist.
            ValueError: If the index lacks any of the ``id``, ``filename`` or
                ``label`` columns.
        """
        super().__init__(
            root=str(root),
            convert_mono=convert_mono,
            sample_rate=sample_rate,
            target_duration_sec=target_duration_sec,
        )
        # Assigned after the base initializer so this class always works with
        # a ``Path`` regardless of what the base class stored.
        self.root = Path(root)
        self.index_path = self.root / index_file
        self.files_dir = self.root / files_dir

        if not self.index_path.exists():
            raise FileNotFoundError(f"CSEM index file not found: {self.index_path}")
        if not self.files_dir.exists():
            raise FileNotFoundError(
                f"CSEM files directory not found: {self.files_dir}"
            )

        self.df = pd.read_csv(self.index_path)

        # Basic validation of expected columns
        expected_cols = {"id", "filename", "label"}
        missing = expected_cols - set(self.df.columns)
        if missing:
            raise ValueError(
                f"CSEM index.csv is missing columns: {sorted(missing)}; "
                f"found columns: {list(self.df.columns)}"
            )

        # Labels are already integers per README; keep the values as-is and
        # only normalise the dtype to plain Python ints.
        self.labels = self.df["label"].astype(int).tolist()

        # Map for optional meta output in IssueManager: every distinct label
        # id maps to its own string form.
        classes = sorted(set(self.labels))
        self.idx_to_class = {idx: str(idx) for idx in classes}

    def __len__(self) -> int:
        """Return the number of rows in the index."""
        return len(self.df)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, str, int, torch.Tensor]:
        """Load the audio for row ``idx`` of the index.

        Returns:
            ``(waveform, path, label, noisy_label)`` where ``noisy_label`` is
            always ``torch.tensor(0)``.

        Raises:
            FileNotFoundError: If neither ``<filename>.wav`` nor ``<filename>``
                is a regular file inside the audio directory.
        """
        row = self.df.iloc[idx]
        guid = str(row["filename"]).strip()

        # Files are stored as <guid>.wav, but the index may already carry the
        # extension, so the bare name is tried second.
        suffixed_path = self.files_dir / f"{guid}.wav"
        bare_path = self.files_dir / guid

        # ``is_file`` rather than ``exists``: a blank filename makes
        # ``bare_path`` equal to the audio directory itself, which exists but
        # is not loadable audio.
        if suffixed_path.is_file():
            audio_path = suffixed_path
        elif bare_path.is_file():
            audio_path = bare_path
        else:
            raise FileNotFoundError(
                f"Audio file not found for row {idx}: {suffixed_path} "
                f"(also tried: {bare_path})"
            )

        label = int(row["label"])  # already numeric per README
        # The second value returned by the loader is not needed here.
        waveform, _ = self._load_and_preprocess_audio(audio_path)
        return waveform, str(audio_path), label, torch.tensor(0)

    def get_errors(self) -> tuple[list, list]:
        """Return dummy ground truth for CSEM dataset (no ground truth available).

        Returns two empty lists to bypass scoring requirements while allowing
        ranking generation to proceed.
        """
        # For near duplicates: return empty pairs
        # For other issues: return empty error indicators
        return [], []