ingest: resolve bandcamp label sources down to individual tracks
A yt-dlp source pointing at a label/artist page flat-extracts to a mix of
/track/ and /album/ URLs. The provider used each verbatim as a locator, so an
/album/ URL was handed to the fetcher as if it were a track: yt-dlp then
(mis)downloaded the whole album into one file and tagged it from the
playlist-level info, which carries the album title and no artist — surfacing as
"Unknown artist" on the stream.
Drill picked container entries down to a single track before emitting a
locator, bounded by a small depth so nested containers (label -> album ->
track) resolve while a real track (which flat-extracts to just itself) is the
base case. Locators are now always downloadable tracks, so the existing
tag/fetch path and anti-repeat keying work as intended.
Also make guess_metadata trust the explicit artist tag over the "Artist -
Title" title split: some label uploads double the artist into the title
("Artist - Artist - Title"), which the blind last-" - " split mis-parsed. When
that artist prefixes the title we peel it off (repeatedly), falling back to the
split only when there is no artist tag or the tag does not prefix the title.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
a65cc61ccd
commit
032a9b86b8
2 changed files with 72 additions and 20 deletions
|
|
@ -28,6 +28,11 @@ log = logging.getLogger("radieo.provider.ytdlp")
|
|||
class YtdlpProvider:
|
||||
name = "ytdlp"
|
||||
|
||||
# How many container levels (label -> album -> track) to expand while
|
||||
# drilling down to a single track. One flat extraction of a label page is
|
||||
# not enough: its /album/ entries are themselves containers.
|
||||
_MAX_EXPAND_DEPTH = 3
|
||||
|
||||
def __init__(self, urls_file: Path, db: Database, cache_dir: Path):
|
||||
self._urls_file = urls_file
|
||||
self._db = db
|
||||
|
|
@ -77,26 +82,66 @@ class YtdlpProvider:
|
|||
|
||||
def _resolve(self, src_url: str, recent: set[str]) -> Track | None:
|
||||
try:
|
||||
entries = self._entries_for(src_url)
|
||||
entry = self._pick_entry(src_url, recent, 0)
|
||||
except Exception as exc: # yt-dlp raises many extractor-specific errors
|
||||
log.warning("yt-dlp could not resolve %s: %s", src_url, exc)
|
||||
return None
|
||||
if not entries:
|
||||
if entry is None:
|
||||
return None
|
||||
pool = [e for e in entries if e["url"] not in recent] or entries
|
||||
entry = random.choice(pool)
|
||||
locator = entry["url"]
|
||||
meta = self._resolved_tags(locator)
|
||||
if meta is not None: # a previous download already wrote real tags
|
||||
artist, title, mbid = meta.artist, meta.title, meta.mbid
|
||||
else:
|
||||
# Split the entry's "Artist - Title" the same way the fetcher will
|
||||
# after downloading, so the scheduler's canonical anti-repeat keys
|
||||
# this provisional track the same before and after the download.
|
||||
guess = tagging.guess_metadata(
|
||||
{"title": entry.get("title"), "artist": entry.get("artist")}
|
||||
)
|
||||
artist = guess.artist
|
||||
title = guess.title if entry.get("title") else locator
|
||||
mbid = None
|
||||
return Track(
|
||||
backend="ytdlp",
|
||||
locator=locator,
|
||||
artist=(meta.artist if meta else entry.get("artist")) or "Unknown artist",
|
||||
title=(meta.title if meta else entry.get("title")) or locator,
|
||||
artist=artist,
|
||||
title=title,
|
||||
origin=self.name,
|
||||
mbid=meta.mbid if meta else None,
|
||||
mbid=mbid,
|
||||
source_url=src_url if src_url != locator else None,
|
||||
)
|
||||
|
||||
def _pick_entry(self, src_url: str, recent: set[str], depth: int) -> dict | None:
    """Choose one entry reachable from ``src_url``, descending through containers.

    The entries listed for a URL may themselves be containers (the motivating
    case is a bandcamp label/artist page whose ``/album/`` entries each list
    further tracks; emitting such a URL as a locator is what produced the
    "Unknown artist" downloads). Each round picks one entry at random,
    preferring URLs that are not in ``recent``, and then expands that entry
    in turn.

    The descent ends when:

    * the URL's only entry is the URL itself -- a track resolves to itself,
      and that entry is returned;
    * the picked entry points straight back at the URL being expanded;
    * ``_MAX_EXPAND_DEPTH`` levels have already been expanded; or
    * the picked entry expands to nothing, in which case the picked entry is
      returned as-is.

    ``depth`` is the number of levels already expanded by the caller.
    Returns ``None`` only when ``src_url`` itself yields no entries.
    """
    target = src_url
    level = depth
    last_pick = None  # entry chosen one level up; the result if we dead-end
    while True:
        candidates = self._entries_for(target)
        if not candidates:
            # Nothing usable below: fall back to whatever was picked above.
            return last_pick
        if len(candidates) == 1 and candidates[0]["url"] == target:
            return candidates[0]  # leaf: a track resolves to itself

        # Prefer entries not in ``recent``; if that leaves nothing, use all.
        fresh = [item for item in candidates if item["url"] not in recent]
        last_pick = random.choice(fresh if fresh else candidates)
        picked_url = last_pick["url"]

        if picked_url == target:
            return last_pick
        if level >= self._MAX_EXPAND_DEPTH:
            return last_pick

        # The pick may itself be a container (an album); expand it next.
        target = picked_url
        level += 1
|
||||
|
||||
def _resolved_tags(self, locator: str):
|
||||
"""Corrected identity a previous download wrote into the cached file.
|
||||
|
||||
|
|
|
|||
|
|
@ -41,21 +41,28 @@ def is_bandcamp(info: dict, locator: str) -> bool:
|
|||
def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta:
|
||||
"""Derive tags from a yt-dlp *download* info dict.
|
||||
|
||||
Priority mirrors the user's shell heuristic: the track title usually carries
|
||||
``"Artist - Title"`` (split on the *last* ``" - "``), which is more reliable
|
||||
than the uploader on label accounts. When the title has no separator we fall
|
||||
back to the explicit ``artist`` field, preferring it over the uploader/label.
|
||||
On a bandcamp download the reliable signal is the explicit ``artist`` tag
|
||||
(the performing artist, *not* the label the way ``uploader`` is). Label
|
||||
uploads name the ``title`` ``"Artist - Title"`` — sometimes with the artist
|
||||
doubled (``"Artist - Artist - Title"``) — so when that artist tag prefixes
|
||||
the title we peel it off rather than guessing where to split. Only when
|
||||
there is no such artist tag do we fall back to splitting the title on the
|
||||
last ``" - "`` (and then to the uploader/label as a last resort).
|
||||
"""
|
||||
raw_title = (info.get("title") or "").strip()
|
||||
artist, title = _split_artist_title(raw_title)
|
||||
if not artist:
|
||||
artist = (
|
||||
info.get("artist")
|
||||
or info.get("creator")
|
||||
or info.get("uploader")
|
||||
or info.get("channel")
|
||||
or ""
|
||||
).strip()
|
||||
reliable = (info.get("artist") or info.get("creator") or "").strip()
|
||||
prefix = f"{reliable} - "
|
||||
if reliable and raw_title.startswith(prefix):
|
||||
artist = reliable
|
||||
title = raw_title
|
||||
while title.startswith(prefix): # peel a doubled "Artist - " prefix too
|
||||
title = title[len(prefix):].strip()
|
||||
else:
|
||||
artist, title = _split_artist_title(raw_title)
|
||||
if not artist:
|
||||
artist = reliable or (
|
||||
info.get("uploader") or info.get("channel") or ""
|
||||
).strip()
|
||||
if not title:
|
||||
title = raw_title or (info.get("track") or "").strip()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue