ingest: resolve bandcamp label sources down to individual tracks

A yt-dlp source pointing at a label/artist page flat-extracts to a mix of /track/ and /album/ URLs. The provider used each verbatim as a locator, so an /album/ URL was handed to the fetcher as if it were a track: yt-dlp then (mis)downloaded the whole album into one file and tagged it from the playlist-level info, which carries the album title and no artist — surfacing as "Unknown artist" on the stream. Now, when the picked entry is itself a container, drill down into it until it resolves to a single track before emitting a locator. The drill-down is bounded by a small depth, so nested containers (label -> album -> track) resolve, while a real track (which flat-extracts to just itself) is the base case. Locators are now always downloadable tracks, so the existing tag/fetch path and anti-repeat keying work as intended. Also make guess_metadata trust the explicit artist tag over the "Artist - Title" title split: some label uploads double the artist into the title ("Artist - Artist - Title"), which the blind last-" - " split mis-parsed. When that artist prefixes the title we peel it off (repeatedly), falling back to the split only when the artist tag is missing or does not prefix the title. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-03 12:48:45 +08:00 · 2026-07-03 12:48:45 +08:00 · 032a9b86b8
commit 032a9b86b8
parent a65cc61ccd
2 changed files with 72 additions and 20 deletions
--- a/ingest/radieo/providers/ytdlp.py
+++ b/ingest/radieo/providers/ytdlp.py
@ -28,6 +28,11 @@ log = logging.getLogger("radieo.provider.ytdlp")
 class YtdlpProvider:
    name = "ytdlp"
    # How many container levels (label -> album -> track) to expand while
    # drilling down to a single track. One flat extraction of a label page is
    # not enough: its /album/ entries are themselves containers.
    _MAX_EXPAND_DEPTH = 3
    def __init__(self, urls_file: Path, db: Database, cache_dir: Path):
        self._urls_file = urls_file
        self._db = db
@ -77,26 +82,66 @@ class YtdlpProvider:
    def _resolve(self, src_url: str, recent: set[str]) -> Track | None:
        try:
-            entries = self._entries_for(src_url)
+            entry = self._pick_entry(src_url, recent, 0)
        except Exception as exc:  # yt-dlp raises many extractor-specific errors
            log.warning("yt-dlp could not resolve %s: %s", src_url, exc)
            return None
-        if not entries:
+        if entry is None:
            return None
        pool = [e for e in entries if e["url"] not in recent] or entries
        entry = random.choice(pool)
        locator = entry["url"]
        meta = self._resolved_tags(locator)
        if meta is not None:  # a previous download already wrote real tags
            artist, title, mbid = meta.artist, meta.title, meta.mbid
        else:
            # Split the entry's "Artist - Title" the same way the fetcher will
            # after downloading, so the scheduler's canonical anti-repeat keys
            # this provisional track the same before and after the download.
            guess = tagging.guess_metadata(
                {"title": entry.get("title"), "artist": entry.get("artist")}
            )
            artist = guess.artist
            title = guess.title if entry.get("title") else locator
            mbid = None
        return Track(
            backend="ytdlp",
            locator=locator,
-            artist=(meta.artist if meta else entry.get("artist")) or "Unknown artist",
+            artist=artist,
-            title=(meta.title if meta else entry.get("title")) or locator,
+            title=title,
            origin=self.name,
-            mbid=meta.mbid if meta else None,
+            mbid=mbid,
            source_url=src_url if src_url != locator else None,
        )
    def _pick_entry(self, src_url: str, recent: set[str], depth: int) -> dict | None:
        """Pick one *track* entry under ``src_url``, drilling through containers.
        A bandcamp label/artist page flat-extracts to a mix of ``/track/`` and
        ``/album/`` URLs, and an ``/album/`` URL is itself a container. If such a
        URL were emitted as a locator, the fetcher would (mis)download the whole
        album into a single file and tag it with album-level, artist-less
        metadata — the "Unknown artist" bug. So when the picked entry is itself a
        container we recurse into it, bounded by ``_MAX_EXPAND_DEPTH``, until the
        locator is a single downloadable track.
        A real track flat-extracts to just itself, which is the recursion's base
        case. That leaf extraction is a full (non-flat) metadata fetch, so the
        entry carries the track's real ``title``/``artist``; it is cached, so the
        cost is paid once per refresh window.
        """
        entries = self._entries_for(src_url)
        if not entries:
            return None
        if len(entries) == 1 and entries[0]["url"] == src_url:
            return entries[0]  # leaf: a track resolves to itself
        pool = [e for e in entries if e["url"] not in recent] or entries
        entry = random.choice(pool)
        url = entry["url"]
        if url == src_url or depth >= self._MAX_EXPAND_DEPTH:
            return entry
        # The picked entry may itself be a container (an album); drill into it.
        # Fall back to the entry as-is if that turns up nothing usable.
        return self._pick_entry(url, recent, depth + 1) or entry
    def _resolved_tags(self, locator: str):
        """Corrected identity a previous download wrote into the cached file.
--- a/ingest/radieo/tagging.py
+++ b/ingest/radieo/tagging.py
@ -41,21 +41,28 @@ def is_bandcamp(info: dict, locator: str) -> bool:
 def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta:
    """Derive tags from a yt-dlp *download* info dict.
-    Priority mirrors the user's shell heuristic: the track title usually carries
+    On a bandcamp download the reliable signal is the explicit ``artist`` tag
-    ``"Artist - Title"`` (split on the *last* ``" - "``), which is more reliable
+    (the performing artist, *not* the label the way ``uploader`` is). Label
-    than the uploader on label accounts. When the title has no separator we fall
+    uploads name the ``title`` ``"Artist - Title"`` — sometimes with the artist
-    back to the explicit ``artist`` field, preferring it over the uploader/label.
+    doubled (``"Artist - Artist - Title"``) — so when that artist tag prefixes
    the title we peel it off rather than guessing where to split. Only when
    there is no such artist tag do we fall back to splitting the title on the
    last ``" - "`` (and then to the uploader/label as a last resort).
    """
    raw_title = (info.get("title") or "").strip()
-    artist, title = _split_artist_title(raw_title)
+    reliable = (info.get("artist") or info.get("creator") or "").strip()
-    if not artist:
+    prefix = f"{reliable} - "
-        artist = (
+    if reliable and raw_title.startswith(prefix):
-            info.get("artist")
+        artist = reliable
-            or info.get("creator")
+        title = raw_title
-            or info.get("uploader")
+        while title.startswith(prefix):  # peel a doubled "Artist - " prefix too
-            or info.get("channel")
+            title = title[len(prefix):].strip()
-            or ""
+    else:
-        ).strip()
+        artist, title = _split_artist_title(raw_title)
        if not artist:
            artist = reliable or (
                info.get("uploader") or info.get("channel") or ""
            ).strip()
    if not title:
        title = raw_title or (info.get("track") or "").strip()