ingest: resolve bandcamp label sources down to individual tracks

A yt-dlp source pointing at a label/artist page flat-extracts to a mix of
/track/ and /album/ URLs. The provider used each verbatim as a locator, so an
/album/ URL was handed to the fetcher as if it were a track: yt-dlp then
(mis)downloaded the whole album into one file and tagged it from the
playlist-level info, which carries the album title and no artist — surfacing as
"Unknown artist" on the stream.

Drill picked container entries down to a single track before emitting a
locator, bounded by a small depth so nested containers (label -> album ->
track) resolve while a real track (which flat-extracts to just itself) is the
base case. Locators are now always downloadable tracks, so the existing
tag/fetch path and anti-repeat keying work as intended.

Also make guess_metadata trust the explicit artist tag over the "Artist -
Title" title split: some label uploads double the artist into the title
("Artist - Artist - Title"), which the blind last-" - " split mis-parsed. When
that artist prefixes the title we peel it off (repeatedly), falling back to the
split only when there is no artist tag.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
nemunaire 2026-07-03 12:48:45 +08:00
commit 032a9b86b8
2 changed files with 72 additions and 20 deletions

View file

@ -28,6 +28,11 @@ log = logging.getLogger("radieo.provider.ytdlp")
class YtdlpProvider:
name = "ytdlp"
# How many container levels (label -> album -> track) to expand while
# drilling down to a single track. One flat extraction of a label page is
# not enough: its /album/ entries are themselves containers.
_MAX_EXPAND_DEPTH = 3
def __init__(self, urls_file: Path, db: Database, cache_dir: Path):
self._urls_file = urls_file
self._db = db
@ -77,26 +82,66 @@ class YtdlpProvider:
def _resolve(self, src_url: str, recent: set[str]) -> Track | None:
try:
entries = self._entries_for(src_url)
entry = self._pick_entry(src_url, recent, 0)
except Exception as exc: # yt-dlp raises many extractor-specific errors
log.warning("yt-dlp could not resolve %s: %s", src_url, exc)
return None
if not entries:
if entry is None:
return None
pool = [e for e in entries if e["url"] not in recent] or entries
entry = random.choice(pool)
locator = entry["url"]
meta = self._resolved_tags(locator)
if meta is not None: # a previous download already wrote real tags
artist, title, mbid = meta.artist, meta.title, meta.mbid
else:
# Split the entry's "Artist - Title" the same way the fetcher will
# after downloading, so the scheduler's canonical anti-repeat keys
# this provisional track the same before and after the download.
guess = tagging.guess_metadata(
{"title": entry.get("title"), "artist": entry.get("artist")}
)
artist = guess.artist
title = guess.title if entry.get("title") else locator
mbid = None
return Track(
backend="ytdlp",
locator=locator,
artist=(meta.artist if meta else entry.get("artist")) or "Unknown artist",
title=(meta.title if meta else entry.get("title")) or locator,
artist=artist,
title=title,
origin=self.name,
mbid=meta.mbid if meta else None,
mbid=mbid,
source_url=src_url if src_url != locator else None,
)
def _pick_entry(self, src_url: str, recent: set[str], depth: int) -> dict | None:
"""Pick one *track* entry under ``src_url``, drilling through containers.
A bandcamp label/artist page flat-extracts to a mix of ``/track/`` and
``/album/`` URLs, and an ``/album/`` URL is itself a container. If such a
URL were emitted as a locator, the fetcher would (mis)download the whole
album into a single file and tag it with album-level, artist-less
metadata the "Unknown artist" bug. So when the picked entry is itself a
container we recurse into it, bounded by ``_MAX_EXPAND_DEPTH``, until the
locator is a single downloadable track.
A real track flat-extracts to just itself, which is the recursion's base
case. That leaf extraction is a full (non-flat) metadata fetch, so the
entry carries the track's real ``title``/``artist``; it is cached, so the
cost is paid once per refresh window.
"""
entries = self._entries_for(src_url)
if not entries:
return None
if len(entries) == 1 and entries[0]["url"] == src_url:
return entries[0] # leaf: a track resolves to itself
pool = [e for e in entries if e["url"] not in recent] or entries
entry = random.choice(pool)
url = entry["url"]
if url == src_url or depth >= self._MAX_EXPAND_DEPTH:
return entry
# The picked entry may itself be a container (an album); drill into it.
# Fall back to the entry as-is if that turns up nothing usable.
return self._pick_entry(url, recent, depth + 1) or entry
def _resolved_tags(self, locator: str):
"""Corrected identity a previous download wrote into the cached file.

View file

@ -41,21 +41,28 @@ def is_bandcamp(info: dict, locator: str) -> bool:
def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta:
"""Derive tags from a yt-dlp *download* info dict.
Priority mirrors the user's shell heuristic: the track title usually carries
``"Artist - Title"`` (split on the *last* ``" - "``), which is more reliable
than the uploader on label accounts. When the title has no separator we fall
back to the explicit ``artist`` field, preferring it over the uploader/label.
On a bandcamp download the reliable signal is the explicit ``artist`` tag
(the performing artist, *not* the label the way ``uploader`` is). Label
uploads name the ``title`` ``"Artist - Title"`` — sometimes with the artist
doubled (``"Artist - Artist - Title"``) — so when that artist tag prefixes
the title we peel it off rather than guessing where to split. Only when
there is no such artist tag do we fall back to splitting the title on the
last ``" - "`` (and then to the uploader/label as a last resort).
"""
raw_title = (info.get("title") or "").strip()
artist, title = _split_artist_title(raw_title)
if not artist:
artist = (
info.get("artist")
or info.get("creator")
or info.get("uploader")
or info.get("channel")
or ""
).strip()
reliable = (info.get("artist") or info.get("creator") or "").strip()
prefix = f"{reliable} - "
if reliable and raw_title.startswith(prefix):
artist = reliable
title = raw_title
while title.startswith(prefix): # peel a doubled "Artist - " prefix too
title = title[len(prefix):].strip()
else:
artist, title = _split_artist_title(raw_title)
if not artist:
artist = reliable or (
info.get("uploader") or info.get("channel") or ""
).strip()
if not title:
title = raw_title or (info.get("track") or "").strip()