ingest: tag bandcamp downloads and correct label-account artists

yt-dlp's flat extraction mis-attributes bandcamp uploads: on a label account the artist becomes the label, and many files ship untagged. After a download, guess (artist, title, album) from the richer download metadata and the "Artist - Title" filename shape, confirm against MusicBrainz, and write ID3 tags into the file. Existing tags are respected; MusicBrainz is primary, filename parsing only a fallback. The file's tags are the single source of truth: the provider reads them back on the next pick, so the corrected identity (incl. MBID) is in place before the scheduler keys and anti-repeat-checks the track, and cache reuse inherits it too. Fetchers now return (path, Track) so the corrected metadata flows back. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-03 11:01:24 +08:00 · 2026-07-03 11:01:24 +08:00 · fbdb2d6bb3
commit fbdb2d6bb3
parent aad3c9d0f7
8 changed files with 346 additions and 36 deletions
--- a/ingest/radieo/main.py
+++ b/ingest/radieo/main.py
@ -19,14 +19,18 @@ class _NullCanonicalizer:
    def canonicalize(self, track):
        return track

+    def identify(self, artist, title):
+        return None, None, None
+
    def close(self):
        pass


-def _build_pipeline(db: Database):
+def _build_pipeline(db: Database, canonicalizer):
    """Return (providers, fetchers). Assembles whichever sources are enabled;
    when none is, the scheduler yields nothing and the stream plays its local
-    cache fallback."""
+    cache fallback. ``canonicalizer`` is handed to the yt-dlp fetcher so it can
+    confirm guessed bandcamp tags against MusicBrainz."""
    providers = []  # list[(provider, weight)]
    fetchers = {}  # backend name -> fetcher
    subsonic_client = None  # reused by ListenBrainz for resolution
@ -54,9 +58,9 @@ def _build_pipeline(db: Database):
        from .fetchers.ytdlp import YtdlpFetcher
        from .providers.ytdlp import YtdlpProvider

-        provider = YtdlpProvider(config.YTDLP_URLS_FILE, db)
+        provider = YtdlpProvider(config.YTDLP_URLS_FILE, db, config.CACHE_DIR)
        providers.append((provider, config.SOURCE_WEIGHTS["ytdlp"]))
-        fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR)
+        fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR, canonicalizer)
        log.info(
            "yt-dlp source enabled (urls=%s, weight=%d)",
            config.YTDLP_URLS_FILE,
@ -78,7 +82,7 @@ def _build_pipeline(db: Database):
        if "ytdlp" not in fetchers:
            from .fetchers.ytdlp import YtdlpFetcher

-            fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR)
+            fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR, canonicalizer)
        provider = ListenBrainzProvider(
            config.LISTENBRAINZ_URL,
            db,
@ -148,7 +152,7 @@ def main() -> None:
        canonicalizer = _NullCanonicalizer()
        log.info("Canonicalizer disabled: tracks keyed by (artist, title).")

-    providers, fetchers = _build_pipeline(db)
+    providers, fetchers = _build_pipeline(db, canonicalizer)
    scheduler = Scheduler(providers, canonicalizer, db)
    queue = TrackQueue(scheduler, fetchers, db)
    queue.start()
--- a/ingest/radieo/canonicalizer.py
+++ b/ingest/radieo/canonicalizer.py
@ -49,15 +49,49 @@ class Canonicalizer:

        cached, mbid = self._db.get_canonical(artist_norm, title_norm)
        if not cached:
-            ok, mbid = self._lookup(track.artist, track.title)
+            ok, rec = self._search(track.artist, track.title)
            if ok:  # cache hits and genuine misses; skip transient errors
+                mbid = rec.get("id") if rec else None
+                log.info(
+                    "%s for %s — %s",
+                    f"MBID {mbid}" if mbid else "no confident MBID",
+                    track.artist,
+                    track.title,
+                )
                self._db.put_canonical(artist_norm, title_norm, mbid)
        return replace(track, mbid=mbid) if mbid else track

+    def identify(
+        self, artist: str, title: str
+    ) -> tuple[str | None, str | None, str | None]:
+        """Best-effort MusicBrainz identification for tagging.
+
+        Returns ``(mbid, canonical_artist, canonical_title)``; any field may be
+        None. Unlike :meth:`canonicalize` it also hands back the recording's
+        canonical names so a caller can clean up a mis-tagged file. Warms the
+        MBID cache as a side effect but is not itself cached, so call it at most
+        once per download.
+        """
+        artist_norm = norm_name(artist)
+        title_norm = norm_name(title)
+        if not artist_norm or not title_norm:
+            return None, None, None
+        ok, rec = self._search(artist, title)
+        if not ok or not rec:
+            return None, None, None
+        mbid = rec.get("id")
+        if mbid:
+            self._db.put_canonical(artist_norm, title_norm, mbid)
+        return mbid, _artist_credit(rec), rec.get("title")
+
    # --- MusicBrainz ------------------------------------------------------

-    def _lookup(self, artist: str, title: str) -> tuple[bool, str | None]:
-        """Return (ok, mbid). ``ok`` False on a transient error (do not cache)."""
+    def _search(self, artist: str, title: str) -> tuple[bool, dict | None]:
+        """Query MusicBrainz for the best recording match above the threshold.
+
+        Returns ``(ok, recording)``; ``ok`` False on a transient error (caller
+        must not cache), ``recording`` None on a genuine no-confident-match.
+        """
        query = f'artist:"{_escape(artist)}" AND recording:"{_escape(title)}"'
        self._throttle()
        try:
@ -72,14 +106,9 @@ class Canonicalizer:
            return False, None

        recordings = data.get("recordings") or []
-        if recordings:
-            best = recordings[0]
-            if int(best.get("score", 0)) >= config.CANONICAL_MIN_SCORE:
-                mbid = best.get("id")
-                log.info("MBID %s for %s — %s", mbid, artist, title)
-                return True, mbid
-        log.info("no confident MBID for %s — %s", artist, title)
-        return True, None  # genuine miss: cache it
+        if recordings and int(recordings[0].get("score", 0)) >= config.CANONICAL_MIN_SCORE:
+            return True, recordings[0]
+        return True, None

    def _throttle(self) -> None:
        with self._rate_lock:
@ -96,3 +125,22 @@ class Canonicalizer:
 # quoted phrase.
 def _escape(value: str) -> str:
    return value.replace("\\", "\\\\").replace('"', '\\"')
+
+
+def _artist_credit(rec: dict) -> str | None:
+    """Flatten a recording's ``artist-credit`` into a display name.
+
+    MusicBrainz splits collaborations into credited parts joined by phrases
+    (e.g. ``"A"`` + ``" & "`` + ``"B"``); stitch them back together.
+    """
+    credits = rec.get("artist-credit") or []
+    parts: list[str] = []
+    for c in credits:
+        if isinstance(c, str):
+            parts.append(c)
+            continue
+        name = c.get("name") or (c.get("artist") or {}).get("name") or ""
+        parts.append(name)
+        parts.append(c.get("joinphrase") or "")
+    name = "".join(parts).strip()
+    return name or None
--- a/ingest/radieo/fetchers/subsonic.py
+++ b/ingest/radieo/fetchers/subsonic.py
@ -25,12 +25,12 @@ class SubsonicFetcher:
        self._client = client
        self._cache_dir = cache_dir

-    def fetch(self, track: Track) -> Path:
+    def fetch(self, track: Track) -> tuple[Path, Track]:
        stem = f"subsonic-{_SAFE.sub('_', track.locator)}"
        # Reuse a still-cached copy rather than downloading again.
        for existing in self._cache_dir.glob(f"{stem}.*"):
            if not existing.name.endswith(".part"):
-                return existing
+                return existing, track

        tmp = self._cache_dir / f".{uuid4().hex}.part"
        try:
@ -43,4 +43,5 @@ class SubsonicFetcher:
            tmp.unlink(missing_ok=True)
            raise
        log.info("downloaded %s -> %s", track, dest.name)
-        return dest
+        # Subsonic files are already tagged by the library server; pass through.
+        return dest, track
--- a/ingest/radieo/fetchers/ytdlp.py
+++ b/ingest/radieo/fetchers/ytdlp.py
@ -12,27 +12,48 @@ Liquidsoap decodes it via ffmpeg, so no re-encoding is needed.

 import hashlib
 import logging
+from dataclasses import replace
 from pathlib import Path
 from uuid import uuid4

+from .. import tagging
 from ..models import Track

 log = logging.getLogger("radieo.fetcher.ytdlp")


+def cache_stem(locator: str) -> str:
+    """Cache filename stem for a yt-dlp locator (shared with the provider)."""
+    h = hashlib.sha1(locator.encode()).hexdigest()[:16]
+    return f"ytdlp-{h}"
+
+
+def cached_file(cache_dir: Path, locator: str) -> Path | None:
+    """The ready (non-.part) cached file for a locator, if one exists."""
+    for existing in cache_dir.glob(f"{cache_stem(locator)}.*"):
+        if not existing.name.endswith(".part"):
+            return existing
+    return None
+
+
 class YtdlpFetcher:
    backend = "ytdlp"

-    def __init__(self, cache_dir: Path):
+    def __init__(self, cache_dir: Path, canonicalizer):
        self._cache_dir = cache_dir
+        # Cross-checks guessed bandcamp tags against MusicBrainz. A
+        # _NullCanonicalizer stands in when the lookup is disabled, so this is
+        # always safe to call.
+        self._canonicalizer = canonicalizer

-    def fetch(self, track: Track) -> Path:
-        h = hashlib.sha1(track.locator.encode()).hexdigest()[:16]
-        stem = f"ytdlp-{h}"
-        # Reuse a still-cached copy rather than downloading again.
-        for existing in self._cache_dir.glob(f"{stem}.*"):
-            if not existing.name.endswith(".part"):
-                return existing
+    def fetch(self, track: Track) -> tuple[Path, Track]:
+        stem = cache_stem(track.locator)
+        # Reuse a still-cached copy rather than downloading again. The provider
+        # already read this file's tags into the Track (see YtdlpProvider), so
+        # its metadata is the corrected one.
+        existing = cached_file(self._cache_dir, track.locator)
+        if existing is not None:
+            return existing, track

        tmp_base = self._cache_dir / f".{stem}.{uuid4().hex}"
        opts = {
@ -58,7 +79,56 @@ class YtdlpFetcher:
                leftover.unlink(missing_ok=True)
            raise
        log.info("downloaded %s -> %s", track, dest.name)
-        return dest
+        return dest, self._retag(dest, info, track)
+
+    def _retag(self, dest: Path, info: dict, track: Track) -> Track:
+        """For bandcamp downloads, ensure sane ID3 tags and refine the Track.
+
+        Bandcamp label accounts leave the flat-extracted artist pointing at the
+        label and many uploads ship untagged. Three cases, in order:
+
+        - the file is *already* tagged (bandcamp did it right): keep those tags,
+          just adopt them into the Track so the stream annotation matches;
+        - otherwise search MusicBrainz from the download metadata/filename and,
+          on a confident match, write its canonical artist/title;
+        - failing that, fall back to parsing the filename-style title.
+
+        The tags land in the file, which is the single source of truth: the
+        provider reads them back on the next pick (before the scheduler keys the
+        track), and a cache-reuse hit inherits them via that same Track. Any
+        failure leaves the Track untouched.
+        """
+        if not tagging.is_bandcamp(info, track.locator):
+            return track
+        try:
+            existing = tagging.read_tags(dest)
+            if existing is not None:
+                log.info("keeping bandcamp tags on %s: %s — %s",
+                         dest.name, existing.artist, existing.title)
+                meta = existing
+            else:
+                guess = tagging.guess_metadata(info, track.source_url)
+                mbid, mb_artist, mb_title = self._canonicalizer.identify(
+                    guess.artist, guess.title
+                )
+                if mb_artist and mb_title:
+                    meta = tagging.TagMeta(mb_artist, mb_title, guess.album, mbid)
+                else:  # MusicBrainz had nothing: trust the filename
+                    meta = guess
+                tagging.write_tags(dest, meta)
+                log.info(
+                    "tagged %s as %s — %s%s",
+                    dest.name,
+                    meta.artist,
+                    meta.title,
+                    f" [{meta.album}]" if meta.album else "",
+                )
+        except Exception:
+            log.exception("tagging failed for %s", dest.name)
+            return track
+        return replace(
+            track, artist=meta.artist, title=meta.title, mbid=meta.mbid or track.mbid
+        )

    def _downloaded_path(self, info: dict, tmp_base: Path) -> Path:
        reqs = info.get("requested_downloads")
--- a/ingest/radieo/providers/ytdlp.py
+++ b/ingest/radieo/providers/ytdlp.py
@ -17,8 +17,9 @@ import random
 import time
 from pathlib import Path

-from .. import config
+from .. import config, tagging
 from ..db import Database
+from ..fetchers.ytdlp import cached_file
 from ..models import Track

 log = logging.getLogger("radieo.provider.ytdlp")
@ -27,9 +28,10 @@ log = logging.getLogger("radieo.provider.ytdlp")
 class YtdlpProvider:
    name = "ytdlp"

-    def __init__(self, urls_file: Path, db: Database):
+    def __init__(self, urls_file: Path, db: Database, cache_dir: Path):
        self._urls_file = urls_file
        self._db = db
+        self._cache_dir = cache_dir
        self._urls: list[str] = []
        self._loaded_at = 0.0
        # source URL -> (entries, loaded_at); entries are normalized dicts.
@ -83,15 +85,32 @@ class YtdlpProvider:
            return None
        pool = [e for e in entries if e["url"] not in recent] or entries
        entry = random.choice(pool)
+        locator = entry["url"]
+        meta = self._resolved_tags(locator)
        return Track(
            backend="ytdlp",
-            locator=entry["url"],
-            artist=entry.get("artist") or "Unknown artist",
-            title=entry.get("title") or entry["url"],
+            locator=locator,
+            artist=(meta.artist if meta else entry.get("artist")) or "Unknown artist",
+            title=(meta.title if meta else entry.get("title")) or locator,
            origin=self.name,
-            source_url=src_url if src_url != entry["url"] else None,
+            mbid=meta.mbid if meta else None,
+            source_url=src_url if src_url != locator else None,
        )

+    def _resolved_tags(self, locator: str):
+        """Corrected identity a previous download wrote into the cached file.
+
+        The yt-dlp fetcher fixes bandcamp label accounts and writes the real
+        artist/title/MBID into the file's tags. Reading them back here — before
+        the scheduler keys and anti-repeat-checks the track — makes selection,
+        history and cache reuse agree on one identity. Returns None when the
+        file isn't cached (yet) or carries no usable tags.
+        """
+        if not tagging.is_bandcamp({}, locator):
+            return None
+        cached = cached_file(self._cache_dir, locator)
+        return tagging.read_tags(cached) if cached else None
+
    def _entries_for(self, src_url: str) -> list[dict]:
        now = time.time()
        cached = self._expanded.get(src_url)
--- a/ingest/radieo/queue.py
+++ b/ingest/radieo/queue.py
@ -64,7 +64,9 @@ class TrackQueue:
                log.error("no fetcher for backend %r (%s)", track.backend, track)
                continue
            try:
-                path = fetcher.fetch(track)
+                # Fetchers may refine the Track's metadata (e.g. correcting a
+                # bandcamp label account to the real artist), so take it back.
+                path, track = fetcher.fetch(track)
            except Exception:
                log.exception("fetch failed for %s", track)
                continue
--- a/ingest/radieo/tagging.py
+++ b/ingest/radieo/tagging.py
@ -0,0 +1,165 @@
+"""Guess and write ID3 tags for freshly downloaded bandcamp tracks.
+
+The yt-dlp *flat* extraction the provider uses to pick a track is thin: on a
+bandcamp **label** account the uploader is the label rather than the performing
+artist, and many bandcamp mp3s ship with no ID3 tags at all. The full metadata
+yt-dlp gathers when it actually downloads is richer, so this module re-derives
+``(artist, title, album)`` from it — trusting the ``"Artist - Title"`` shape of
+the track title the way the user's ``id3tag`` one-liner trusts the filename —
+and writes the result into the file's tags.
+
+Everything here is best-effort: the artist is *guessed*, then the caller cross-
+checks it against MusicBrainz (:meth:`Canonicalizer.identify`) to adopt a
+canonical spelling and MBID when one is found confidently. A failure to open or
+tag the file just leaves it untagged; playback is unaffected.
+"""
+
+import logging
+import re
+from dataclasses import dataclass
+from pathlib import Path
+
+log = logging.getLogger("radieo.tagging")
+
+
+@dataclass
+class TagMeta:
+    artist: str
+    title: str
+    album: str | None = None
+    mbid: str | None = None
+
+
+def is_bandcamp(info: dict, locator: str) -> bool:
+    """True when the download came from bandcamp (extractor key or host)."""
+    key = (info.get("extractor_key") or info.get("extractor") or "").lower()
+    if "bandcamp" in key:
+        return True
+    return "bandcamp.com" in (locator or "")
+
+
+def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta:
+    """Derive tags from a yt-dlp *download* info dict.
+
+    Priority mirrors the user's shell heuristic: the track title usually carries
+    ``"Artist - Title"`` (split on the *last* ``" - "``), which is more reliable
+    than the uploader on label accounts. When the title has no separator we fall
+    back to the explicit ``artist`` field, preferring it over the uploader/label.
+    """
+    raw_title = (info.get("title") or "").strip()
+    artist, title = _split_artist_title(raw_title)
+    if not artist:
+        artist = (
+            info.get("artist")
+            or info.get("creator")
+            or info.get("uploader")
+            or info.get("channel")
+            or ""
+        ).strip()
+    if not title:
+        title = raw_title or (info.get("track") or "").strip()
+
+    album = (info.get("album") or "").strip() or _album_from_url(
+        source_url or info.get("webpage_url")
+    )
+    return TagMeta(
+        artist=artist or "Unknown artist",
+        title=title or "Unknown title",
+        album=album or None,
+    )
+
+
+def read_tags(path: Path) -> TagMeta | None:
+    """Return the file's existing tags, or None when it has no usable ones.
+
+    "Usable" means both an artist and a title are present — that is the signal
+    that bandcamp already tagged the file properly, in which case we leave it
+    alone rather than second-guessing it with MusicBrainz.
+    """
+    from mutagen import File as MutagenFile
+
+    try:
+        audio = MutagenFile(str(path), easy=True)
+    except Exception:
+        return None
+    if audio is None or not audio.tags:
+        return None
+    artist = _first(audio, "artist")
+    title = _first(audio, "title")
+    if not (artist and title):
+        return None
+    return TagMeta(
+        artist=artist,
+        title=title,
+        album=_first(audio, "album") or None,
+        mbid=_first(audio, "musicbrainz_trackid") or None,
+    )
+
+
+def write_tags(path: Path, meta: TagMeta) -> bool:
+    """Write ``meta`` into the file's tags, overwriting existing ones.
+
+    Returns True on success. Any failure (unsupported container, unwritable
+    file) is logged and swallowed — tagging never breaks a download.
+    """
+    from mutagen import File as MutagenFile
+
+    try:
+        audio = MutagenFile(str(path), easy=True)
+    except Exception as exc:  # corrupt/unknown file: leave it alone
+        log.warning("cannot open %s for tagging: %s", Path(path).name, exc)
+        return False
+    if audio is None:
+        log.info("no mutagen handler for %s; skipping tags", Path(path).name)
+        return False
+    try:
+        if audio.tags is None:
+            audio.add_tags()
+        audio["artist"] = meta.artist
+        audio["title"] = meta.title
+        if meta.album:
+            audio["album"] = meta.album
+        if meta.mbid:
+            try:
+                audio["musicbrainz_trackid"] = meta.mbid
+            except (KeyError, ValueError):
+                pass  # container's easy interface doesn't map this key
+        audio.save()
+        return True
+    except Exception as exc:
+        log.warning("could not write tags to %s: %s", Path(path).name, exc)
+        return False
+
+
+# --- helpers --------------------------------------------------------------
+
+
+def _first(audio, key: str) -> str:
+    """First value of an easy-tag key, stripped; "" when absent."""
+    values = audio.tags.get(key) or []
+    return values[0].strip() if values and values[0] else ""
+
+
+def _split_artist_title(text: str) -> tuple[str | None, str]:
+    """Split ``"Artist - Title"`` on the last ``" - "`` (like the shell one-liner).
+
+    Returns ``(None, text)`` when there is no separator to split on.
+    """
+    idx = text.rfind(" - ")
+    if idx == -1:
+        return None, text.strip()
+    return text[:idx].strip() or None, text[idx + 3 :].strip()
+
+
+def _album_from_url(url: str | None) -> str | None:
+    """Deslugify a bandcamp ``/album/<slug>`` path into a readable album name.
+
+    Only a fallback for when the info dict carries no album; ``/track/`` URLs
+    yield nothing.
+    """
+    if not url:
+        return None
+    m = re.search(r"/album/([^/?#]+)", url)
+    if not m:
+        return None
+    return m.group(1).replace("-", " ").strip().title() or None
--- a/ingest/requirements.txt
+++ b/ingest/requirements.txt
@ -1,2 +1,3 @@
 httpx>=0.27
 yt-dlp>=2024.1
+mutagen>=1.47