ingest: resolve bandcamp label sources down to individual tracks

A yt-dlp source pointing at a label/artist page flat-extracts to a mix of /track/ and /album/ URLs. The provider used each verbatim as a locator, so an /album/ URL was handed to the fetcher as if it were a track: yt-dlp then (mis)downloaded the whole album into one file and tagged it from the playlist-level info, which carries the album title and no artist — surfacing as "Unknown artist" on the stream. The provider now drills a picked container entry down to a single track before emitting a locator, bounded by a small depth so nested containers (label -> album -> track) resolve while a real track (which flat-extracts to just itself) is the base case. Locators therefore resolve to downloadable tracks (the picked entry is used as-is only if the depth bound is hit or drilling finds nothing), so the existing tag/fetch path and anti-repeat keying work as intended. Also make guess_metadata trust the explicit artist tag over the "Artist - Title" title split: some label uploads double the artist into the title ("Artist - Artist - Title"), which the blind last-" - " split mis-parsed. When that artist prefixes the title we peel it off (repeatedly), falling back to the split only when there is no artist tag or it does not prefix the title. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-03 12:48:45 +08:00 · 2026-07-03 12:48:45 +08:00 · 032a9b86b8
commit 032a9b86b8
parent a65cc61ccd
2 changed files with 72 additions and 20 deletions
--- a/ingest/radieo/providers/ytdlp.py
+++ b/ingest/radieo/providers/ytdlp.py
@ -28,6 +28,11 @@ log = logging.getLogger("radieo.provider.ytdlp")
 class YtdlpProvider:
    name = "ytdlp"

+    # How many container levels (label -> album -> track) to expand while
+    # drilling down to a single track. One flat extraction of a label page is
+    # not enough: its /album/ entries are themselves containers.
+    _MAX_EXPAND_DEPTH = 3
+
    def __init__(self, urls_file: Path, db: Database, cache_dir: Path):
        self._urls_file = urls_file
        self._db = db
@ -77,26 +82,66 @@ class YtdlpProvider:

    def _resolve(self, src_url: str, recent: set[str]) -> Track | None:
        try:
-            entries = self._entries_for(src_url)
+            entry = self._pick_entry(src_url, recent, 0)
        except Exception as exc:  # yt-dlp raises many extractor-specific errors
            log.warning("yt-dlp could not resolve %s: %s", src_url, exc)
            return None
-        if not entries:
+        if entry is None:
            return None
-        pool = [e for e in entries if e["url"] not in recent] or entries
-        entry = random.choice(pool)
        locator = entry["url"]
        meta = self._resolved_tags(locator)
+        if meta is not None:  # a previous download already wrote real tags
+            artist, title, mbid = meta.artist, meta.title, meta.mbid
+        else:
+            # Split the entry's "Artist - Title" the same way the fetcher will
+            # after downloading, so the scheduler's canonical anti-repeat keys
+            # this provisional track the same before and after the download.
+            guess = tagging.guess_metadata(
+                {"title": entry.get("title"), "artist": entry.get("artist")}
+            )
+            artist = guess.artist
+            title = guess.title if entry.get("title") else locator
+            mbid = None
        return Track(
            backend="ytdlp",
            locator=locator,
-            artist=(meta.artist if meta else entry.get("artist")) or "Unknown artist",
-            title=(meta.title if meta else entry.get("title")) or locator,
+            artist=artist,
+            title=title,
            origin=self.name,
-            mbid=meta.mbid if meta else None,
+            mbid=mbid,
            source_url=src_url if src_url != locator else None,
        )

+    def _pick_entry(self, src_url: str, recent: set[str], depth: int) -> dict | None:
+        """Pick one *track* entry under ``src_url``, drilling through containers.
+
+        A bandcamp label/artist page flat-extracts to a mix of ``/track/`` and
+        ``/album/`` URLs, and an ``/album/`` URL is itself a container. If such a
+        URL were emitted as a locator, the fetcher would (mis)download the whole
+        album into a single file and tag it with album-level, artist-less
+        metadata — the "Unknown artist" bug. So when the picked entry is itself a
+        container we recurse into it, bounded by ``_MAX_EXPAND_DEPTH``, until the
+        locator is a single downloadable track.
+
+        A real track flat-extracts to just itself, which is the recursion's base
+        case. That leaf extraction is a full (non-flat) metadata fetch, so the
+        entry carries the track's real ``title``/``artist``; it is cached, so the
+        cost is paid once per refresh window.
+        """
+        entries = self._entries_for(src_url)
+        if not entries:
+            return None
+        if len(entries) == 1 and entries[0]["url"] == src_url:
+            return entries[0]  # leaf: a track resolves to itself
+        pool = [e for e in entries if e["url"] not in recent] or entries
+        entry = random.choice(pool)
+        url = entry["url"]
+        if url == src_url or depth >= self._MAX_EXPAND_DEPTH:
+            return entry
+        # The picked entry may itself be a container (an album); drill into it.
+        # Fall back to the entry as-is if that turns up nothing usable.
+        return self._pick_entry(url, recent, depth + 1) or entry
+
    def _resolved_tags(self, locator: str):
        """Corrected identity a previous download wrote into the cached file.

--- a/ingest/radieo/tagging.py
+++ b/ingest/radieo/tagging.py
@ -41,21 +41,28 @@ def is_bandcamp(info: dict, locator: str) -> bool:
 def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta:
    """Derive tags from a yt-dlp *download* info dict.

-    Priority mirrors the user's shell heuristic: the track title usually carries
-    ``"Artist - Title"`` (split on the *last* ``" - "``), which is more reliable
-    than the uploader on label accounts. When the title has no separator we fall
-    back to the explicit ``artist`` field, preferring it over the uploader/label.
+    On a bandcamp download the reliable signal is the explicit ``artist`` tag
+    (the performing artist, *not* the label the way ``uploader`` is). Label
+    uploads name the ``title`` ``"Artist - Title"`` — sometimes with the artist
+    doubled (``"Artist - Artist - Title"``) — so when that artist tag prefixes
+    the title we peel it off rather than guessing where to split. Only when
+    there is no such artist tag do we fall back to splitting the title on the
+    last ``" - "`` (and then to the uploader/label as a last resort).
    """
    raw_title = (info.get("title") or "").strip()
-    artist, title = _split_artist_title(raw_title)
-    if not artist:
-        artist = (
-            info.get("artist")
-            or info.get("creator")
-            or info.get("uploader")
-            or info.get("channel")
-            or ""
-        ).strip()
+    reliable = (info.get("artist") or info.get("creator") or "").strip()
+    prefix = f"{reliable} - "
+    if reliable and raw_title.startswith(prefix):
+        artist = reliable
+        title = raw_title
+        while title.startswith(prefix):  # peel a doubled "Artist - " prefix too
+            title = title[len(prefix):].strip()
+    else:
+        artist, title = _split_artist_title(raw_title)
+        if not artist:
+            artist = reliable or (
+                info.get("uploader") or info.get("channel") or ""
+            ).strip()
    if not title:
        title = raw_title or (info.get("track") or "").strip()