Source code for selfclean_audio.datasets.csem

# Copyright (c) Lucerne University of Applied Sciences and Arts.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.


from pathlib import Path

import pandas as pd
import torch

from selfclean_audio.datasets.base import BaseAudioDataset


class CSEMMembranePumps(BaseAudioDataset):
    """
    CSEM Membrane Pump Audio Dataset loader.

    Expects the following structure under ``root`` (see ``data/CSEM/README.md``)::

        root/
          files/
            {guid}.wav
            ...
          index.csv   # columns: id, filename, label

    Each item is a 4-tuple ``(waveform, path, label, noisy_label)``:

    * ``waveform`` -- first value returned by ``_load_and_preprocess_audio``.
    * ``path`` -- the resolved audio file path as a string. It is built from
      ``root``, so it is absolute only if ``root`` is.
    * ``label`` -- the integer from the ``label`` column of the index.
    * ``noisy_label`` -- always ``torch.tensor(0)``. The real value is not
      known for this dataset; it is set by the synthetic/noise wrappers when
      applicable, otherwise considered 0 by downstream code.
    """

    def __init__(
        self,
        root: str | Path,
        convert_mono: bool = True,
        sample_rate: int = 16000,
        target_duration_sec: float | None = None,
        index_file: str | Path = "index.csv",
        files_dir: str | Path = "files",
    ):
        """Read and validate the dataset index.

        Args:
            root: Dataset root directory.
            convert_mono: Forwarded unchanged to ``BaseAudioDataset``.
            sample_rate: Forwarded unchanged to ``BaseAudioDataset``.
            target_duration_sec: Forwarded unchanged to ``BaseAudioDataset``.
            index_file: Index CSV location, relative to ``root``.
            files_dir: Audio directory location, relative to ``root``.

        Raises:
            FileNotFoundError: If the index is not an existing file or the
                audio directory is not an existing directory.
            ValueError: If the index lacks any of the columns ``id``,
                ``filename`` or ``label``.
        """
        super().__init__(
            root=str(root),
            convert_mono=convert_mono,
            sample_rate=sample_rate,
            target_duration_sec=target_duration_sec,
        )
        self.root = Path(root)
        self.index_path = self.root / index_file
        self.files_dir = self.root / files_dir

        # is_file()/is_dir() rather than exists(): a directory in place of the
        # index, or a regular file in place of the audio directory, should be
        # reported here and not as an obscure failure later on.
        if not self.index_path.is_file():
            raise FileNotFoundError(f"CSEM index file not found: {self.index_path}")
        if not self.files_dir.is_dir():
            raise FileNotFoundError(f"CSEM files directory not found: {self.files_dir}")

        self.df = pd.read_csv(self.index_path)

        # Basic validation of expected columns
        expected_cols = {"id", "filename", "label"}
        missing = expected_cols - set(self.df.columns)
        if missing:
            raise ValueError(
                f"CSEM index.csv is missing columns: {sorted(missing)}; "
                f"found columns: {list(self.df.columns)}"
            )

        # Labels are already integers per the dataset README; keep them as-is
        # so downstream code sees the same ids as the index.
        self.labels = self.df["label"].astype(int).tolist()

        # Map for optional meta output in IssueManager: each distinct label id
        # maps to its own string form.
        classes = sorted(set(self.labels))
        self.idx_to_class = {idx: str(idx) for idx in classes}

    def __len__(self) -> int:
        """Return the number of rows in the index."""
        return len(self.df)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, str, int, torch.Tensor]:
        """Load the sample at positional index ``idx``.

        Returns:
            ``(waveform, path, label, noisy_label)`` where ``noisy_label`` is
            always ``torch.tensor(0)``.

        Raises:
            FileNotFoundError: If neither ``<filename>.wav`` nor ``<filename>``
                is a regular file inside the audio directory.
        """
        row = self.df.iloc[idx]
        guid = str(row["filename"]).strip()

        # Files are stored as <guid>.wav; also allow index entries that
        # already carry the extension.
        candidates = (self.files_dir / f"{guid}.wav", self.files_dir / guid)

        # is_file() instead of exists(): an empty filename cell makes the
        # second candidate equal to the audio directory itself, which exists
        # but is not loadable audio.
        audio_path = next((path for path in candidates if path.is_file()), None)
        if audio_path is None:
            raise FileNotFoundError(
                f"Audio file not found for row {idx}: {candidates[0]}"
            )

        label = int(row["label"])  # already numeric per README
        waveform, _ = self._load_and_preprocess_audio(audio_path)
        return waveform, str(audio_path), label, torch.tensor(0)

    def get_errors(self):
        """Return dummy ground truth for CSEM dataset (no ground truth available).

        Returns empty lists to bypass scoring requirements while allowing
        ranking generation to proceed.
        """
        # For near duplicates: return empty pairs
        # For other issues: return empty error indicators
        return [], []