From 032a9b86b8bd74261f125461737d32d1dbb615cd Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Fri, 3 Jul 2026 12:48:45 +0800 Subject: [PATCH] ingest: resolve bandcamp label sources down to individual tracks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A yt-dlp source pointing at a label/artist page flat-extracts to a mix of /track/ and /album/ URLs. The provider used each verbatim as a locator, so an /album/ URL was handed to the fetcher as if it were a track: yt-dlp then (mis)downloaded the whole album into one file and tagged it from the playlist-level info, which carries the album title and no artist — surfacing as "Unknown artist" on the stream. Drill each picked container entry down to a single track before emitting a locator, bounded by a small depth so nested containers (label -> album -> track) resolve while a real track (which flat-extracts to just itself) is the base case. Locators are now downloadable tracks unless the depth bound is hit or a container yields nothing usable (the picked entry is then emitted as-is), so the existing tag/fetch path and anti-repeat keying work as intended. Also make guess_metadata trust the explicit artist tag over the "Artist - Title" title split: some label uploads double the artist into the title ("Artist - Artist - Title"), which the blind last-" - " split mis-parsed. When that artist prefixes the title we peel it off (repeatedly), falling back to the split when the artist tag is missing or does not prefix the title. 
Co-Authored-By: Claude Opus 4.8 --- ingest/radieo/providers/ytdlp.py | 59 ++++++++++++++++++++++++++++---- ingest/radieo/tagging.py | 33 +++++++++++------- 2 files changed, 72 insertions(+), 20 deletions(-) diff --git a/ingest/radieo/providers/ytdlp.py b/ingest/radieo/providers/ytdlp.py index 77f1e3a..c106e1d 100644 --- a/ingest/radieo/providers/ytdlp.py +++ b/ingest/radieo/providers/ytdlp.py @@ -28,6 +28,11 @@ log = logging.getLogger("radieo.provider.ytdlp") class YtdlpProvider: name = "ytdlp" + # How many container levels (label -> album -> track) to expand while + # drilling down to a single track. One flat extraction of a label page is + # not enough: its /album/ entries are themselves containers. + _MAX_EXPAND_DEPTH = 3 + def __init__(self, urls_file: Path, db: Database, cache_dir: Path): self._urls_file = urls_file self._db = db @@ -77,26 +82,66 @@ class YtdlpProvider: def _resolve(self, src_url: str, recent: set[str]) -> Track | None: try: - entries = self._entries_for(src_url) + entry = self._pick_entry(src_url, recent, 0) except Exception as exc: # yt-dlp raises many extractor-specific errors log.warning("yt-dlp could not resolve %s: %s", src_url, exc) return None - if not entries: + if entry is None: return None - pool = [e for e in entries if e["url"] not in recent] or entries - entry = random.choice(pool) locator = entry["url"] meta = self._resolved_tags(locator) + if meta is not None: # a previous download already wrote real tags + artist, title, mbid = meta.artist, meta.title, meta.mbid + else: + # Split the entry's "Artist - Title" the same way the fetcher will + # after downloading, so the scheduler's canonical anti-repeat keys + # this provisional track the same before and after the download. 
+ guess = tagging.guess_metadata( + {"title": entry.get("title"), "artist": entry.get("artist")} + ) + artist = guess.artist + title = guess.title if entry.get("title") else locator + mbid = None return Track( backend="ytdlp", locator=locator, - artist=(meta.artist if meta else entry.get("artist")) or "Unknown artist", - title=(meta.title if meta else entry.get("title")) or locator, + artist=artist, + title=title, origin=self.name, - mbid=meta.mbid if meta else None, + mbid=mbid, source_url=src_url if src_url != locator else None, ) + def _pick_entry(self, src_url: str, recent: set[str], depth: int) -> dict | None: + """Pick one *track* entry under ``src_url``, drilling through containers. + + A bandcamp label/artist page flat-extracts to a mix of ``/track/`` and + ``/album/`` URLs, and an ``/album/`` URL is itself a container. If such a + URL were emitted as a locator, the fetcher would (mis)download the whole + album into a single file and tag it with album-level, artist-less + metadata — the "Unknown artist" bug. So when the picked entry is itself a + container we recurse into it, bounded by ``_MAX_EXPAND_DEPTH``, until the + locator is a single downloadable track. + + A real track flat-extracts to just itself, which is the recursion's base + case. That leaf extraction is a full (non-flat) metadata fetch, so the + entry carries the track's real ``title``/``artist``; it is cached, so the + cost is paid once per refresh window. + """ + entries = self._entries_for(src_url) + if not entries: + return None + if len(entries) == 1 and entries[0]["url"] == src_url: + return entries[0] # leaf: a track resolves to itself + pool = [e for e in entries if e["url"] not in recent] or entries + entry = random.choice(pool) + url = entry["url"] + if url == src_url or depth >= self._MAX_EXPAND_DEPTH: + return entry + # The picked entry may itself be a container (an album); drill into it. + # Fall back to the entry as-is if that turns up nothing usable. 
+ return self._pick_entry(url, recent, depth + 1) or entry + def _resolved_tags(self, locator: str): """Corrected identity a previous download wrote into the cached file. diff --git a/ingest/radieo/tagging.py b/ingest/radieo/tagging.py index 7e9a1ce..c0941f8 100644 --- a/ingest/radieo/tagging.py +++ b/ingest/radieo/tagging.py @@ -41,21 +41,28 @@ def is_bandcamp(info: dict, locator: str) -> bool: def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta: """Derive tags from a yt-dlp *download* info dict. - Priority mirrors the user's shell heuristic: the track title usually carries - ``"Artist - Title"`` (split on the *last* ``" - "``), which is more reliable - than the uploader on label accounts. When the title has no separator we fall - back to the explicit ``artist`` field, preferring it over the uploader/label. + On a bandcamp download the reliable signal is the explicit ``artist`` tag + (the performing artist, *not* the label the way ``uploader`` is). Label + uploads name the ``title`` ``"Artist - Title"`` — sometimes with the artist + doubled (``"Artist - Artist - Title"``) — so when that artist tag prefixes + the title we peel it off rather than guessing where to split. Only when + there is no such artist tag do we fall back to splitting the title on the + last ``" - "`` (and then to the uploader/label as a last resort). 
""" raw_title = (info.get("title") or "").strip() - artist, title = _split_artist_title(raw_title) - if not artist: - artist = ( - info.get("artist") - or info.get("creator") - or info.get("uploader") - or info.get("channel") - or "" - ).strip() + reliable = (info.get("artist") or info.get("creator") or "").strip() + prefix = f"{reliable} - " + if reliable and raw_title.startswith(prefix): + artist = reliable + title = raw_title + while title.startswith(prefix): # peel a doubled "Artist - " prefix too + title = title[len(prefix):].strip() + else: + artist, title = _split_artist_title(raw_title) + if not artist: + artist = reliable or ( + info.get("uploader") or info.get("channel") or "" + ).strip() if not title: title = raw_title or (info.get("track") or "").strip()