ingest: tag bandcamp downloads and correct label-account artists

yt-dlp's flat extraction mis-attributes bandcamp uploads: on a label account the artist becomes the label, and many files ship untagged. After a download, guess (artist, title, album) from the richer download metadata and the "Artist - Title" filename shape, confirm against MusicBrainz, and write ID3 tags into the file. Existing tags are respected; MusicBrainz is primary, filename parsing only a fallback. The file's tags are the single source of truth: the provider reads them back on the next pick, so the corrected identity (incl. MBID) is in place before the scheduler keys and anti-repeat-checks the track, and cache reuse inherits it too. Fetchers now return (path, Track) so the corrected metadata flows back. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-03 11:01:24 +08:00 · 2026-07-03 11:01:24 +08:00 · fbdb2d6bb3
commit fbdb2d6bb3
parent aad3c9d0f7
8 changed files with 346 additions and 36 deletions
--- a/ingest/radieo/main.py
+++ b/ingest/radieo/main.py
@ -19,14 +19,18 @@ class _NullCanonicalizer:
    def canonicalize(self, track):
        """Return ``track`` unchanged: with the lookup disabled no MBID is attached."""
        return track
    def identify(self, artist, title):
        """Return an all-``None`` triple, i.e. "nothing identified".

        Same three-slot shape a real identification hands back
        (mbid, canonical artist, canonical title), so callers can unpack the
        result without checking which canonicalizer they were given.
        """
        return None, None, None
    def close(self):
        """No-op: this stand-in holds no resources to release."""
        pass
-def _build_pipeline(db: Database):
+def _build_pipeline(db: Database, canonicalizer):
    """Return (providers, fetchers). Assembles whichever sources are enabled;
    when none is, the scheduler yields nothing and the stream plays its local
-    cache fallback."""
+    cache fallback. ``canonicalizer`` is handed to the yt-dlp fetcher so it can
    confirm guessed bandcamp tags against MusicBrainz."""
    providers = []  # list[(provider, weight)]
    fetchers = {}  # backend name -> fetcher
    subsonic_client = None  # reused by ListenBrainz for resolution
@ -54,9 +58,9 @@ def _build_pipeline(db: Database):
        from .fetchers.ytdlp import YtdlpFetcher
        from .providers.ytdlp import YtdlpProvider
-        provider = YtdlpProvider(config.YTDLP_URLS_FILE, db)
+        provider = YtdlpProvider(config.YTDLP_URLS_FILE, db, config.CACHE_DIR)
        providers.append((provider, config.SOURCE_WEIGHTS["ytdlp"]))
-        fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR)
+        fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR, canonicalizer)
        log.info(
            "yt-dlp source enabled (urls=%s, weight=%d)",
            config.YTDLP_URLS_FILE,
@ -78,7 +82,7 @@ def _build_pipeline(db: Database):
        if "ytdlp" not in fetchers:
            from .fetchers.ytdlp import YtdlpFetcher
-            fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR)
+            fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR, canonicalizer)
        provider = ListenBrainzProvider(
            config.LISTENBRAINZ_URL,
            db,
@ -148,7 +152,7 @@ def main() -> None:
        canonicalizer = _NullCanonicalizer()
        log.info("Canonicalizer disabled: tracks keyed by (artist, title).")
-    providers, fetchers = _build_pipeline(db)
+    providers, fetchers = _build_pipeline(db, canonicalizer)
    scheduler = Scheduler(providers, canonicalizer, db)
    queue = TrackQueue(scheduler, fetchers, db)
    queue.start()
--- a/ingest/radieo/canonicalizer.py
+++ b/ingest/radieo/canonicalizer.py
@ -49,15 +49,49 @@ class Canonicalizer:
        cached, mbid = self._db.get_canonical(artist_norm, title_norm)
        if not cached:
-            ok, mbid = self._lookup(track.artist, track.title)
+            ok, rec = self._search(track.artist, track.title)
            if ok:  # cache hits and genuine misses; skip transient errors
                mbid = rec.get("id") if rec else None
                log.info(
                    "%s for %s — %s",
                    f"MBID {mbid}" if mbid else "no confident MBID",
                    track.artist,
                    track.title,
                )
                self._db.put_canonical(artist_norm, title_norm, mbid)
        return replace(track, mbid=mbid) if mbid else track
    def identify(
        self, artist: str, title: str
    ) -> tuple[str | None, str | None, str | None]:
        """Look a guessed artist/title up on MusicBrainz for tagging purposes.

        Returns a ``(mbid, canonical_artist, canonical_title)`` triple in which
        any slot may be None. An all-None triple is returned when either name
        normalizes to nothing, when the search fails transiently, or when no
        recording clears the confidence threshold.

        A found MBID is stored in the canonical cache under the *normalized
        input* names. The search itself is never served from that cache, so
        every call costs one MusicBrainz query: call it at most once per
        download.
        """
        nothing = (None, None, None)

        key_artist = norm_name(artist)
        key_title = norm_name(title)
        if not (key_artist and key_title):
            return nothing

        ok, recording = self._search(artist, title)
        if not (ok and recording):
            return nothing

        mbid = recording.get("id")
        if mbid:
            # Warm the MBID cache so a later lookup of the same names is free.
            self._db.put_canonical(key_artist, key_title, mbid)
        return mbid, _artist_credit(recording), recording.get("title")
    # --- MusicBrainz ------------------------------------------------------
-    def _lookup(self, artist: str, title: str) -> tuple[bool, str | None]:
+    def _search(self, artist: str, title: str) -> tuple[bool, dict | None]:
-        """Return (ok, mbid). ``ok`` False on a transient error (do not cache)."""
+        """Query MusicBrainz for the best recording match above the threshold.
        Returns ``(ok, recording)``; ``ok`` False on a transient error (caller
        must not cache), ``recording`` None on a genuine no-confident-match.
        """
        query = f'artist:"{_escape(artist)}" AND recording:"{_escape(title)}"'
        self._throttle()
        try:
@ -72,14 +106,9 @@ class Canonicalizer:
            return False, None
        recordings = data.get("recordings") or []
-        if recordings:
+        if recordings and int(recordings[0].get("score", 0)) >= config.CANONICAL_MIN_SCORE:
-            best = recordings[0]
+            return True, recordings[0]
-            if int(best.get("score", 0)) >= config.CANONICAL_MIN_SCORE:
+        return True, None
                mbid = best.get("id")
                log.info("MBID %s for %s — %s", mbid, artist, title)
                return True, mbid
        log.info("no confident MBID for %s — %s", artist, title)
        return True, None  # genuine miss: cache it
    def _throttle(self) -> None:
        with self._rate_lock:
@ -96,3 +125,22 @@ class Canonicalizer:
 # quoted phrase.
 def _escape(value: str) -> str:
    return value.replace("\\", "\\\\").replace('"', '\\"')
 def _artist_credit(rec: dict) -> str | None:
    """Flatten a recording's ``artist-credit`` into a display name.
    MusicBrainz splits collaborations into credited parts joined by phrases
    (e.g. ``"A"`` + ``" & "`` + ``"B"``); stitch them back together.
    """
    credits = rec.get("artist-credit") or []
    parts: list[str] = []
    for c in credits:
        if isinstance(c, str):
            parts.append(c)
            continue
        name = c.get("name") or (c.get("artist") or {}).get("name") or ""
        parts.append(name)
        parts.append(c.get("joinphrase") or "")
    name = "".join(parts).strip()
    return name or None
--- a/ingest/radieo/fetchers/subsonic.py
+++ b/ingest/radieo/fetchers/subsonic.py
@ -25,12 +25,12 @@ class SubsonicFetcher:
        self._client = client
        self._cache_dir = cache_dir
-    def fetch(self, track: Track) -> Path:
+    def fetch(self, track: Track) -> tuple[Path, Track]:
        stem = f"subsonic-{_SAFE.sub('_', track.locator)}"
        # Reuse a still-cached copy rather than downloading again.
        for existing in self._cache_dir.glob(f"{stem}.*"):
            if not existing.name.endswith(".part"):
-                return existing
+                return existing, track
        tmp = self._cache_dir / f".{uuid4().hex}.part"
        try:
@ -43,4 +43,5 @@ class SubsonicFetcher:
            tmp.unlink(missing_ok=True)
            raise
        log.info("downloaded %s -> %s", track, dest.name)
-        return dest
+        # Subsonic files are already tagged by the library server; pass through.
        return dest, track
--- a/ingest/radieo/fetchers/ytdlp.py
+++ b/ingest/radieo/fetchers/ytdlp.py
@ -12,27 +12,48 @@ Liquidsoap decodes it via ffmpeg, so no re-encoding is needed.
 import hashlib
 import logging
 from dataclasses import replace
 from pathlib import Path
 from uuid import uuid4
 from .. import tagging
 from ..models import Track
 log = logging.getLogger("radieo.fetcher.ytdlp")
 def cache_stem(locator: str) -> str:
    """Return the cache filename stem for a yt-dlp locator.

    The stem is ``ytdlp-`` plus the first 16 hex digits of the locator's
    SHA-1, so the same locator always maps to the same stem.
    """
    digest = hashlib.sha1(locator.encode()).hexdigest()
    return "ytdlp-" + digest[:16]
 def cached_file(cache_dir: Path, locator: str) -> Path | None:
    """Return a finished cached download for ``locator``, or None.

    Any file in ``cache_dir`` whose name starts with the locator's cache stem
    qualifies, except names ending in ``.part``.
    """
    candidates = cache_dir.glob(f"{cache_stem(locator)}.*")
    return next((p for p in candidates if not p.name.endswith(".part")), None)
 class YtdlpFetcher:
    backend = "ytdlp"
-    def __init__(self, cache_dir: Path):
+    def __init__(self, cache_dir: Path, canonicalizer):
        self._cache_dir = cache_dir
        # Cross-checks guessed bandcamp tags against MusicBrainz. A
        # _NullCanonicalizer stands in when the lookup is disabled, so this is
        # always safe to call.
        self._canonicalizer = canonicalizer
-    def fetch(self, track: Track) -> Path:
+    def fetch(self, track: Track) -> tuple[Path, Track]:
-        h = hashlib.sha1(track.locator.encode()).hexdigest()[:16]
+        stem = cache_stem(track.locator)
-        stem = f"ytdlp-{h}"
+        # Reuse a still-cached copy rather than downloading again. The provider
-        # Reuse a still-cached copy rather than downloading again.
+        # already read this file's tags into the Track (see YtdlpProvider), so
-        for existing in self._cache_dir.glob(f"{stem}.*"):
+        # its metadata is the corrected one.
-            if not existing.name.endswith(".part"):
+        existing = cached_file(self._cache_dir, track.locator)
-                return existing
+        if existing is not None:
            return existing, track
        tmp_base = self._cache_dir / f".{stem}.{uuid4().hex}"
        opts = {
@ -58,7 +79,56 @@ class YtdlpFetcher:
                leftover.unlink(missing_ok=True)
            raise
        log.info("downloaded %s -> %s", track, dest.name)
-        return dest
+        return dest, self._retag(dest, info, track)
    def _retag(self, dest: Path, info: dict, track: Track) -> Track:
        """For bandcamp downloads, ensure sane tags and refine the Track.

        Bandcamp label accounts leave the flat-extracted artist pointing at the
        label and many uploads ship untagged. Three cases, in order:

        - the file is *already* tagged: keep those tags and just adopt them
          into the Track so the stream annotation matches;
        - otherwise search MusicBrainz from the download metadata/filename and,
          on a confident match, write its canonical artist/title (and MBID);
        - failing that, fall back to the guess parsed from the title.

        Args:
            dest: the finished download on disk.
            info: the yt-dlp info dict for this download.
            track: the Track as picked by the provider.

        Returns:
            A copy of ``track`` carrying the resolved artist/title/MBID, or
            ``track`` itself for non-bandcamp downloads and on any failure.
            Never raises: tagging must not break a download.
        """
        if not tagging.is_bandcamp(info, track.locator):
            return track
        try:
            existing = tagging.read_tags(dest)
            if existing is not None:
                log.info("keeping bandcamp tags on %s: %s — %s",
                         dest.name, existing.artist, existing.title)
                meta = existing
            else:
                guess = tagging.guess_metadata(info, track.source_url)
                mbid, mb_artist, mb_title = self._canonicalizer.identify(
                    guess.artist, guess.title
                )
                if mb_artist and mb_title:
                    meta = tagging.TagMeta(mb_artist, mb_title, guess.album, mbid)
                else:  # MusicBrainz had nothing: trust the filename
                    meta = guess
                # write_tags reports failure through its return value rather
                # than raising; only claim the file was tagged when it was.
                if tagging.write_tags(dest, meta):
                    log.info(
                        "tagged %s as %s — %s%s",
                        dest.name,
                        meta.artist,
                        meta.title,
                        f" [{meta.album}]" if meta.album else "",
                    )
                else:
                    # The refined identity is still better than the flat one
                    # for this play, but nothing was persisted in the file.
                    log.warning(
                        "tags not written to %s; using %s — %s for this play only",
                        dest.name,
                        meta.artist,
                        meta.title,
                    )
        except Exception:
            log.exception("tagging failed for %s", dest.name)
            return track
        return replace(
            track, artist=meta.artist, title=meta.title, mbid=meta.mbid or track.mbid
        )
    def _downloaded_path(self, info: dict, tmp_base: Path) -> Path:
        reqs = info.get("requested_downloads")
--- a/ingest/radieo/providers/ytdlp.py
+++ b/ingest/radieo/providers/ytdlp.py
@ -17,8 +17,9 @@ import random
 import time
 from pathlib import Path
-from .. import config
+from .. import config, tagging
 from ..db import Database
 from ..fetchers.ytdlp import cached_file
 from ..models import Track
 log = logging.getLogger("radieo.provider.ytdlp")
@ -27,9 +28,10 @@ log = logging.getLogger("radieo.provider.ytdlp")
 class YtdlpProvider:
    name = "ytdlp"
-    def __init__(self, urls_file: Path, db: Database):
+    def __init__(self, urls_file: Path, db: Database, cache_dir: Path):
        self._urls_file = urls_file
        self._db = db
        self._cache_dir = cache_dir
        self._urls: list[str] = []
        self._loaded_at = 0.0
        # source URL -> (entries, loaded_at); entries are normalized dicts.
@ -83,15 +85,32 @@ class YtdlpProvider:
            return None
        pool = [e for e in entries if e["url"] not in recent] or entries
        entry = random.choice(pool)
        locator = entry["url"]
        meta = self._resolved_tags(locator)
        return Track(
            backend="ytdlp",
-            locator=entry["url"],
+            locator=locator,
-            artist=entry.get("artist") or "Unknown artist",
+            artist=(meta.artist if meta else entry.get("artist")) or "Unknown artist",
-            title=entry.get("title") or entry["url"],
+            title=(meta.title if meta else entry.get("title")) or locator,
            origin=self.name,
-            source_url=src_url if src_url != entry["url"] else None,
+            mbid=meta.mbid if meta else None,
            source_url=src_url if src_url != locator else None,
        )
    def _resolved_tags(self, locator: str):
        """Read back the identity an earlier download stored in the cached file.

        Only bandcamp locators are considered. Returns whatever
        ``tagging.read_tags`` yields for the cached file, or None when the
        locator is not bandcamp or no finished file is cached for it.
        Picking this up at selection time lets the Track carry the corrected
        artist/title/MBID from the start.
        """
        if tagging.is_bandcamp({}, locator):
            path = cached_file(self._cache_dir, locator)
            if path:
                return tagging.read_tags(path)
        return None
    def _entries_for(self, src_url: str) -> list[dict]:
        now = time.time()
        cached = self._expanded.get(src_url)
--- a/ingest/radieo/queue.py
+++ b/ingest/radieo/queue.py
@ -64,7 +64,9 @@ class TrackQueue:
                log.error("no fetcher for backend %r (%s)", track.backend, track)
                continue
            try:
-                path = fetcher.fetch(track)
+                # Fetchers may refine the Track's metadata (e.g. correcting a
                # bandcamp label account to the real artist), so take it back.
                path, track = fetcher.fetch(track)
            except Exception:
                log.exception("fetch failed for %s", track)
                continue
--- a/ingest/radieo/tagging.py
+++ b/ingest/radieo/tagging.py
@ -0,0 +1,165 @@
 """Guess and write ID3 tags for freshly downloaded bandcamp tracks.
 The yt-dlp *flat* extraction the provider uses to pick a track is thin: on a
 bandcamp **label** account the uploader is the label rather than the performing
 artist, and many bandcamp mp3s ship with no ID3 tags at all. The full metadata
 yt-dlp gathers when it actually downloads is richer, so this module re-derives
 ``(artist, title, album)`` from it — trusting the ``"Artist - Title"`` shape of
 the track title the way the user's ``id3tag`` one-liner trusts the filename —
 and writes the result into the file's tags.
 Everything here is best-effort: the artist is *guessed*, then the caller cross-
 checks it against MusicBrainz (:meth:`Canonicalizer.identify`) to adopt a
 canonical spelling and MBID when one is found confidently. A failure to open or
 tag the file just leaves it untagged; playback is unaffected.
 """
 import logging
 import re
 from dataclasses import dataclass
 from pathlib import Path
 log = logging.getLogger("radieo.tagging")
@dataclass
 class TagMeta:
    """Tag values for one audio file: artist and title, plus optional extras."""
    # Performing artist and track title; both are required.
    artist: str
    title: str
    # Album name; None when unknown.
    album: str | None = None
    # MusicBrainz ID, stored in the file under the ``musicbrainz_trackid`` key.
    mbid: str | None = None
 def is_bandcamp(info: dict, locator: str) -> bool:
    """True when the download came from bandcamp (extractor key or host).

    The extractor key/name from ``info`` wins when it mentions bandcamp, which
    also covers bandcamp sites served from a custom domain. Otherwise the
    locator's *hostname* must be ``bandcamp.com`` or one of its subdomains.
    The host is compared case-insensitively, and ``bandcamp.com`` appearing
    only in a path, query string or look-alike domain does not count.
    """
    from urllib.parse import urlsplit

    key = (info.get("extractor_key") or info.get("extractor") or "").lower()
    if "bandcamp" in key:
        return True
    target = (locator or "").strip()
    if not target:
        return False
    if "://" not in target:
        # Scheme-less locators ("artist.bandcamp.com/track/x") would parse as
        # a bare path; anchor them so urlsplit sees a network location.
        target = "//" + target.lstrip("/")
    try:
        host = (urlsplit(target).hostname or "").lower()
    except ValueError:  # malformed netloc, e.g. unbalanced IPv6 brackets
        return False
    return host == "bandcamp.com" or host.endswith(".bandcamp.com")
 def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta:
    """Build a best-guess :class:`TagMeta` from a yt-dlp *download* info dict.

    The track title is split as ``"Artist - Title"`` first. When that yields
    no artist, the first non-empty of ``artist``, ``creator``, ``uploader``
    and ``channel`` is used. A missing title falls back to the whole title
    string, then to ``track``. The album comes from ``album`` or, failing
    that, from an ``/album/<slug>`` segment of ``source_url`` (or of the
    info dict's ``webpage_url``). Unknown artist/title get placeholder text.
    """
    full_title = (info.get("title") or "").strip()
    artist, title = _split_artist_title(full_title)

    if not artist:
        fallbacks = ("artist", "creator", "uploader", "channel")
        # First truthy raw value wins; it is stripped only after selection.
        chosen = next((info.get(k) for k in fallbacks if info.get(k)), "")
        artist = chosen.strip()
    if not title:
        title = full_title if full_title else (info.get("track") or "").strip()

    album = (info.get("album") or "").strip()
    if not album:
        album = _album_from_url(source_url or info.get("webpage_url"))

    return TagMeta(
        artist=artist if artist else "Unknown artist",
        title=title if title else "Unknown title",
        album=album if album else None,
    )
 def read_tags(path: Path) -> TagMeta | None:
    """Return the file's existing tags, or None when it has no usable ones.

    "Usable" means both an artist and a title are present — that is the signal
    that the file was already tagged properly, in which case callers leave it
    alone rather than second-guessing it with MusicBrainz.

    Best-effort end to end: a file that cannot be opened *or* whose tag values
    cannot be extracted yields None instead of raising, so a caller without
    its own handler is not broken by an odd file.
    """
    from mutagen import File as MutagenFile

    try:
        audio = MutagenFile(str(path), easy=True)
        if audio is None or not audio.tags:
            return None
        artist = _first(audio, "artist")
        title = _first(audio, "title")
        if not (artist and title):
            return None
        return TagMeta(
            artist=artist,
            title=title,
            album=_first(audio, "album") or None,
            mbid=_first(audio, "musicbrainz_trackid") or None,
        )
    except Exception:
        # Corrupt file, or a tag container whose values aren't plain strings.
        log.debug("unreadable tags on %s", Path(path).name, exc_info=True)
        return None
 def write_tags(path: Path, meta: TagMeta) -> bool:
    """Store ``meta`` in the file's tags, replacing the values already there.

    Artist and title are always written; album and MBID only when set. An
    MBID key the container's easy interface rejects is skipped silently.

    Returns True once the file has been saved. A file that cannot be opened,
    has no mutagen handler, or cannot be written is logged and reported as
    False — tagging never breaks a download.
    """
    from mutagen import File as MutagenFile

    filename = Path(path).name

    try:
        handle = MutagenFile(str(path), easy=True)
    except Exception as exc:  # corrupt/unknown file: leave it alone
        log.warning("cannot open %s for tagging: %s", filename, exc)
        return False
    if handle is None:
        log.info("no mutagen handler for %s; skipping tags", filename)
        return False

    try:
        if handle.tags is None:
            handle.add_tags()
        handle["artist"] = meta.artist
        handle["title"] = meta.title
        if meta.album:
            handle["album"] = meta.album
        if meta.mbid:
            try:
                handle["musicbrainz_trackid"] = meta.mbid
            except (KeyError, ValueError):
                pass  # container's easy interface doesn't map this key
        handle.save()
    except Exception as exc:
        log.warning("could not write tags to %s: %s", filename, exc)
        return False
    return True
 # --- helpers --------------------------------------------------------------
 def _first(audio, key: str) -> str:
    """First value of an easy-tag key, stripped; "" when absent."""
    values = audio.tags.get(key) or []
    return values[0].strip() if values and values[0] else ""
 def _split_artist_title(text: str) -> tuple[str | None, str]:
    """Split ``"Artist - Title"`` on the last ``" - "`` (like the shell one-liner).
    Returns ``(None, text)`` when there is no separator to split on.
    """
    idx = text.rfind(" - ")
    if idx == -1:
        return None, text.strip()
    return text[:idx].strip() or None, text[idx + 3 :].strip()
 def _album_from_url(url: str | None) -> str | None:
    """Deslugify a bandcamp ``/album/<slug>`` path into a readable album name.
    Only a fallback for when the info dict carries no album; ``/track/`` URLs
    yield nothing.
    """
    if not url:
        return None
    m = re.search(r"/album/([^/?#]+)", url)
    if not m:
        return None
    return m.group(1).replace("-", " ").strip().title() or None
--- a/ingest/requirements.txt
+++ b/ingest/requirements.txt
@ -1,2 +1,3 @@
 httpx>=0.27
 yt-dlp>=2024.1
 mutagen>=1.47