ingest: resolve bandcamp label sources down to individual tracks
A yt-dlp source pointing at a label/artist page flat-extracts to a mix of
/track/ and /album/ URLs. The provider used each verbatim as a locator, so an
/album/ URL was handed to the fetcher as if it were a track: yt-dlp then
(mis)downloaded the whole album into one file and tagged it from the
playlist-level info, which carries the album title and no artist — surfacing as
"Unknown artist" on the stream.
Drill each picked container entry down to a single track before emitting a
locator. The drill-down is bounded by a small depth (_MAX_EXPAND_DEPTH) so that
nested containers (label -> album -> track) resolve, while a real track (which
flat-extracts to just itself) is the base case. Locators are now downloadable
tracks (barring the depth bound or an empty container, where the picked entry
is used as-is), so the existing tag/fetch path and anti-repeat keying work as
intended.
Also make guess_metadata trust the explicit artist tag over the "Artist -
Title" title split: some label uploads double the artist into the title
("Artist - Artist - Title"), which the blind last-" - " split mis-parsed. When
the artist tag prefixes the title we peel it off (repeatedly); otherwise we
fall back to the split, and use the artist tag (then the uploader/channel) only
if the split yields no artist.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
a65cc61ccd
commit
032a9b86b8
2 changed files with 72 additions and 20 deletions
|
|
@ -28,6 +28,11 @@ log = logging.getLogger("radieo.provider.ytdlp")
|
||||||
class YtdlpProvider:
|
class YtdlpProvider:
|
||||||
name = "ytdlp"
|
name = "ytdlp"
|
||||||
|
|
||||||
|
# How many container levels (label -> album -> track) to expand while
|
||||||
|
# drilling down to a single track. One flat extraction of a label page is
|
||||||
|
# not enough: its /album/ entries are themselves containers.
|
||||||
|
_MAX_EXPAND_DEPTH = 3
|
||||||
|
|
||||||
def __init__(self, urls_file: Path, db: Database, cache_dir: Path):
|
def __init__(self, urls_file: Path, db: Database, cache_dir: Path):
|
||||||
self._urls_file = urls_file
|
self._urls_file = urls_file
|
||||||
self._db = db
|
self._db = db
|
||||||
|
|
@ -77,26 +82,66 @@ class YtdlpProvider:
|
||||||
|
|
||||||
def _resolve(self, src_url: str, recent: set[str]) -> Track | None:
|
def _resolve(self, src_url: str, recent: set[str]) -> Track | None:
|
||||||
try:
|
try:
|
||||||
entries = self._entries_for(src_url)
|
entry = self._pick_entry(src_url, recent, 0)
|
||||||
except Exception as exc: # yt-dlp raises many extractor-specific errors
|
except Exception as exc: # yt-dlp raises many extractor-specific errors
|
||||||
log.warning("yt-dlp could not resolve %s: %s", src_url, exc)
|
log.warning("yt-dlp could not resolve %s: %s", src_url, exc)
|
||||||
return None
|
return None
|
||||||
if not entries:
|
if entry is None:
|
||||||
return None
|
return None
|
||||||
pool = [e for e in entries if e["url"] not in recent] or entries
|
|
||||||
entry = random.choice(pool)
|
|
||||||
locator = entry["url"]
|
locator = entry["url"]
|
||||||
meta = self._resolved_tags(locator)
|
meta = self._resolved_tags(locator)
|
||||||
|
if meta is not None: # a previous download already wrote real tags
|
||||||
|
artist, title, mbid = meta.artist, meta.title, meta.mbid
|
||||||
|
else:
|
||||||
|
# Split the entry's "Artist - Title" the same way the fetcher will
|
||||||
|
# after downloading, so the scheduler's canonical anti-repeat keys
|
||||||
|
# this provisional track the same before and after the download.
|
||||||
|
guess = tagging.guess_metadata(
|
||||||
|
{"title": entry.get("title"), "artist": entry.get("artist")}
|
||||||
|
)
|
||||||
|
artist = guess.artist
|
||||||
|
title = guess.title if entry.get("title") else locator
|
||||||
|
mbid = None
|
||||||
return Track(
|
return Track(
|
||||||
backend="ytdlp",
|
backend="ytdlp",
|
||||||
locator=locator,
|
locator=locator,
|
||||||
artist=(meta.artist if meta else entry.get("artist")) or "Unknown artist",
|
artist=artist,
|
||||||
title=(meta.title if meta else entry.get("title")) or locator,
|
title=title,
|
||||||
origin=self.name,
|
origin=self.name,
|
||||||
mbid=meta.mbid if meta else None,
|
mbid=mbid,
|
||||||
source_url=src_url if src_url != locator else None,
|
source_url=src_url if src_url != locator else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _pick_entry(self, src_url: str, recent: set[str], depth: int) -> dict | None:
|
||||||
|
"""Pick one *track* entry under ``src_url``, drilling through containers.
|
||||||
|
|
||||||
|
A bandcamp label/artist page flat-extracts to a mix of ``/track/`` and
|
||||||
|
``/album/`` URLs, and an ``/album/`` URL is itself a container. If such a
|
||||||
|
URL were emitted as a locator, the fetcher would (mis)download the whole
|
||||||
|
album into a single file and tag it with album-level, artist-less
|
||||||
|
metadata — the "Unknown artist" bug. So when the picked entry is itself a
|
||||||
|
container we recurse into it, bounded by ``_MAX_EXPAND_DEPTH``, until the
|
||||||
|
locator is a single downloadable track.
|
||||||
|
|
||||||
|
A real track flat-extracts to just itself, which is the recursion's base
|
||||||
|
case. That leaf extraction is a full (non-flat) metadata fetch, so the
|
||||||
|
entry carries the track's real ``title``/``artist``; it is cached, so the
|
||||||
|
cost is paid once per refresh window.
|
||||||
|
"""
|
||||||
|
entries = self._entries_for(src_url)
|
||||||
|
if not entries:
|
||||||
|
return None
|
||||||
|
if len(entries) == 1 and entries[0]["url"] == src_url:
|
||||||
|
return entries[0] # leaf: a track resolves to itself
|
||||||
|
pool = [e for e in entries if e["url"] not in recent] or entries
|
||||||
|
entry = random.choice(pool)
|
||||||
|
url = entry["url"]
|
||||||
|
if url == src_url or depth >= self._MAX_EXPAND_DEPTH:
|
||||||
|
return entry
|
||||||
|
# The picked entry may itself be a container (an album); drill into it.
|
||||||
|
# Fall back to the entry as-is if that turns up nothing usable.
|
||||||
|
return self._pick_entry(url, recent, depth + 1) or entry
|
||||||
|
|
||||||
def _resolved_tags(self, locator: str):
|
def _resolved_tags(self, locator: str):
|
||||||
"""Corrected identity a previous download wrote into the cached file.
|
"""Corrected identity a previous download wrote into the cached file.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -41,21 +41,28 @@ def is_bandcamp(info: dict, locator: str) -> bool:
|
||||||
def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta:
|
def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta:
|
||||||
"""Derive tags from a yt-dlp *download* info dict.
|
"""Derive tags from a yt-dlp *download* info dict.
|
||||||
|
|
||||||
Priority mirrors the user's shell heuristic: the track title usually carries
|
On a bandcamp download the reliable signal is the explicit ``artist`` tag
|
||||||
``"Artist - Title"`` (split on the *last* ``" - "``), which is more reliable
|
(the performing artist, *not* the label the way ``uploader`` is). Label
|
||||||
than the uploader on label accounts. When the title has no separator we fall
|
uploads name the ``title`` ``"Artist - Title"`` — sometimes with the artist
|
||||||
back to the explicit ``artist`` field, preferring it over the uploader/label.
|
doubled (``"Artist - Artist - Title"``) — so when that artist tag prefixes
|
||||||
|
the title we peel it off rather than guessing where to split. Only when
|
||||||
|
there is no such artist tag do we fall back to splitting the title on the
|
||||||
|
last ``" - "`` (and then to the uploader/label as a last resort).
|
||||||
"""
|
"""
|
||||||
raw_title = (info.get("title") or "").strip()
|
raw_title = (info.get("title") or "").strip()
|
||||||
artist, title = _split_artist_title(raw_title)
|
reliable = (info.get("artist") or info.get("creator") or "").strip()
|
||||||
if not artist:
|
prefix = f"{reliable} - "
|
||||||
artist = (
|
if reliable and raw_title.startswith(prefix):
|
||||||
info.get("artist")
|
artist = reliable
|
||||||
or info.get("creator")
|
title = raw_title
|
||||||
or info.get("uploader")
|
while title.startswith(prefix): # peel a doubled "Artist - " prefix too
|
||||||
or info.get("channel")
|
title = title[len(prefix):].strip()
|
||||||
or ""
|
else:
|
||||||
).strip()
|
artist, title = _split_artist_title(raw_title)
|
||||||
|
if not artist:
|
||||||
|
artist = reliable or (
|
||||||
|
info.get("uploader") or info.get("channel") or ""
|
||||||
|
).strip()
|
||||||
if not title:
|
if not title:
|
||||||
title = raw_title or (info.get("track") or "").strip()
|
title = raw_title or (info.get("track") or "").strip()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue