ingest: tag bandcamp downloads and correct label-account artists
All checks were successful
continuous-integration/drone/push Build is passing

yt-dlp's flat extraction mis-attributes bandcamp uploads: on a label
account the artist becomes the label, and many files ship untagged. After
a download, guess (artist, title, album) from the richer download metadata
and the "Artist - Title" filename shape, confirm against MusicBrainz, and
write ID3 tags into the file. Existing tags are respected; MusicBrainz is
primary, filename parsing only a fallback.

The file's tags are the single source of truth: the provider reads them
back on the next pick, so the corrected identity (incl. MBID) is in place
before the scheduler keys and anti-repeat-checks the track, and cache
reuse inherits it too.

Fetchers now return (path, Track) so the corrected metadata flows back.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
nemunaire 2026-07-03 11:01:24 +08:00
commit fbdb2d6bb3
8 changed files with 346 additions and 36 deletions

View file

@ -19,14 +19,18 @@ class _NullCanonicalizer:
def canonicalize(self, track): def canonicalize(self, track):
return track return track
def identify(self, artist, title):
return None, None, None
def close(self): def close(self):
pass pass
def _build_pipeline(db: Database): def _build_pipeline(db: Database, canonicalizer):
"""Return (providers, fetchers). Assembles whichever sources are enabled; """Return (providers, fetchers). Assembles whichever sources are enabled;
when none is, the scheduler yields nothing and the stream plays its local when none is, the scheduler yields nothing and the stream plays its local
cache fallback.""" cache fallback. ``canonicalizer`` is handed to the yt-dlp fetcher so it can
confirm guessed bandcamp tags against MusicBrainz."""
providers = [] # list[(provider, weight)] providers = [] # list[(provider, weight)]
fetchers = {} # backend name -> fetcher fetchers = {} # backend name -> fetcher
subsonic_client = None # reused by ListenBrainz for resolution subsonic_client = None # reused by ListenBrainz for resolution
@ -54,9 +58,9 @@ def _build_pipeline(db: Database):
from .fetchers.ytdlp import YtdlpFetcher from .fetchers.ytdlp import YtdlpFetcher
from .providers.ytdlp import YtdlpProvider from .providers.ytdlp import YtdlpProvider
provider = YtdlpProvider(config.YTDLP_URLS_FILE, db) provider = YtdlpProvider(config.YTDLP_URLS_FILE, db, config.CACHE_DIR)
providers.append((provider, config.SOURCE_WEIGHTS["ytdlp"])) providers.append((provider, config.SOURCE_WEIGHTS["ytdlp"]))
fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR) fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR, canonicalizer)
log.info( log.info(
"yt-dlp source enabled (urls=%s, weight=%d)", "yt-dlp source enabled (urls=%s, weight=%d)",
config.YTDLP_URLS_FILE, config.YTDLP_URLS_FILE,
@ -78,7 +82,7 @@ def _build_pipeline(db: Database):
if "ytdlp" not in fetchers: if "ytdlp" not in fetchers:
from .fetchers.ytdlp import YtdlpFetcher from .fetchers.ytdlp import YtdlpFetcher
fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR) fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR, canonicalizer)
provider = ListenBrainzProvider( provider = ListenBrainzProvider(
config.LISTENBRAINZ_URL, config.LISTENBRAINZ_URL,
db, db,
@ -148,7 +152,7 @@ def main() -> None:
canonicalizer = _NullCanonicalizer() canonicalizer = _NullCanonicalizer()
log.info("Canonicalizer disabled: tracks keyed by (artist, title).") log.info("Canonicalizer disabled: tracks keyed by (artist, title).")
providers, fetchers = _build_pipeline(db) providers, fetchers = _build_pipeline(db, canonicalizer)
scheduler = Scheduler(providers, canonicalizer, db) scheduler = Scheduler(providers, canonicalizer, db)
queue = TrackQueue(scheduler, fetchers, db) queue = TrackQueue(scheduler, fetchers, db)
queue.start() queue.start()

View file

@ -49,15 +49,49 @@ class Canonicalizer:
cached, mbid = self._db.get_canonical(artist_norm, title_norm) cached, mbid = self._db.get_canonical(artist_norm, title_norm)
if not cached: if not cached:
ok, mbid = self._lookup(track.artist, track.title) ok, rec = self._search(track.artist, track.title)
if ok: # cache hits and genuine misses; skip transient errors if ok: # cache hits and genuine misses; skip transient errors
mbid = rec.get("id") if rec else None
log.info(
"%s for %s%s",
f"MBID {mbid}" if mbid else "no confident MBID",
track.artist,
track.title,
)
self._db.put_canonical(artist_norm, title_norm, mbid) self._db.put_canonical(artist_norm, title_norm, mbid)
return replace(track, mbid=mbid) if mbid else track return replace(track, mbid=mbid) if mbid else track
def identify(
self, artist: str, title: str
) -> tuple[str | None, str | None, str | None]:
"""Best-effort MusicBrainz identification for tagging.
Returns ``(mbid, canonical_artist, canonical_title)``; any field may be
None. Unlike :meth:`canonicalize` it also hands back the recording's
canonical names so a caller can clean up a mis-tagged file. Warms the
MBID cache as a side effect but is not itself cached, so call it at most
once per download.
"""
artist_norm = norm_name(artist)
title_norm = norm_name(title)
if not artist_norm or not title_norm:
return None, None, None
ok, rec = self._search(artist, title)
if not ok or not rec:
return None, None, None
mbid = rec.get("id")
if mbid:
self._db.put_canonical(artist_norm, title_norm, mbid)
return mbid, _artist_credit(rec), rec.get("title")
# --- MusicBrainz ------------------------------------------------------ # --- MusicBrainz ------------------------------------------------------
def _lookup(self, artist: str, title: str) -> tuple[bool, str | None]: def _search(self, artist: str, title: str) -> tuple[bool, dict | None]:
"""Return (ok, mbid). ``ok`` False on a transient error (do not cache).""" """Query MusicBrainz for the best recording match above the threshold.
Returns ``(ok, recording)``; ``ok`` False on a transient error (caller
must not cache), ``recording`` None on a genuine no-confident-match.
"""
query = f'artist:"{_escape(artist)}" AND recording:"{_escape(title)}"' query = f'artist:"{_escape(artist)}" AND recording:"{_escape(title)}"'
self._throttle() self._throttle()
try: try:
@ -72,14 +106,9 @@ class Canonicalizer:
return False, None return False, None
recordings = data.get("recordings") or [] recordings = data.get("recordings") or []
if recordings: if recordings and int(recordings[0].get("score", 0)) >= config.CANONICAL_MIN_SCORE:
best = recordings[0] return True, recordings[0]
if int(best.get("score", 0)) >= config.CANONICAL_MIN_SCORE: return True, None
mbid = best.get("id")
log.info("MBID %s for %s%s", mbid, artist, title)
return True, mbid
log.info("no confident MBID for %s%s", artist, title)
return True, None # genuine miss: cache it
def _throttle(self) -> None: def _throttle(self) -> None:
with self._rate_lock: with self._rate_lock:
@ -96,3 +125,22 @@ class Canonicalizer:
# quoted phrase. # quoted phrase.
def _escape(value: str) -> str: def _escape(value: str) -> str:
return value.replace("\\", "\\\\").replace('"', '\\"') return value.replace("\\", "\\\\").replace('"', '\\"')
def _artist_credit(rec: dict) -> str | None:
"""Flatten a recording's ``artist-credit`` into a display name.
MusicBrainz splits collaborations into credited parts joined by phrases
(e.g. ``"A"`` + ``" & "`` + ``"B"``); stitch them back together.
"""
credits = rec.get("artist-credit") or []
parts: list[str] = []
for c in credits:
if isinstance(c, str):
parts.append(c)
continue
name = c.get("name") or (c.get("artist") or {}).get("name") or ""
parts.append(name)
parts.append(c.get("joinphrase") or "")
name = "".join(parts).strip()
return name or None

View file

@ -25,12 +25,12 @@ class SubsonicFetcher:
self._client = client self._client = client
self._cache_dir = cache_dir self._cache_dir = cache_dir
def fetch(self, track: Track) -> Path: def fetch(self, track: Track) -> tuple[Path, Track]:
stem = f"subsonic-{_SAFE.sub('_', track.locator)}" stem = f"subsonic-{_SAFE.sub('_', track.locator)}"
# Reuse a still-cached copy rather than downloading again. # Reuse a still-cached copy rather than downloading again.
for existing in self._cache_dir.glob(f"{stem}.*"): for existing in self._cache_dir.glob(f"{stem}.*"):
if not existing.name.endswith(".part"): if not existing.name.endswith(".part"):
return existing return existing, track
tmp = self._cache_dir / f".{uuid4().hex}.part" tmp = self._cache_dir / f".{uuid4().hex}.part"
try: try:
@ -43,4 +43,5 @@ class SubsonicFetcher:
tmp.unlink(missing_ok=True) tmp.unlink(missing_ok=True)
raise raise
log.info("downloaded %s -> %s", track, dest.name) log.info("downloaded %s -> %s", track, dest.name)
return dest # Subsonic files are already tagged by the library server; pass through.
return dest, track

View file

@ -12,27 +12,48 @@ Liquidsoap decodes it via ffmpeg, so no re-encoding is needed.
import hashlib import hashlib
import logging import logging
from dataclasses import replace
from pathlib import Path from pathlib import Path
from uuid import uuid4 from uuid import uuid4
from .. import tagging
from ..models import Track from ..models import Track
log = logging.getLogger("radieo.fetcher.ytdlp") log = logging.getLogger("radieo.fetcher.ytdlp")
def cache_stem(locator: str) -> str:
"""Cache filename stem for a yt-dlp locator (shared with the provider)."""
h = hashlib.sha1(locator.encode()).hexdigest()[:16]
return f"ytdlp-{h}"
def cached_file(cache_dir: Path, locator: str) -> Path | None:
"""The ready (non-.part) cached file for a locator, if one exists."""
for existing in cache_dir.glob(f"{cache_stem(locator)}.*"):
if not existing.name.endswith(".part"):
return existing
return None
class YtdlpFetcher: class YtdlpFetcher:
backend = "ytdlp" backend = "ytdlp"
def __init__(self, cache_dir: Path): def __init__(self, cache_dir: Path, canonicalizer):
self._cache_dir = cache_dir self._cache_dir = cache_dir
# Cross-checks guessed bandcamp tags against MusicBrainz. A
# _NullCanonicalizer stands in when the lookup is disabled, so this is
# always safe to call.
self._canonicalizer = canonicalizer
def fetch(self, track: Track) -> Path: def fetch(self, track: Track) -> tuple[Path, Track]:
h = hashlib.sha1(track.locator.encode()).hexdigest()[:16] stem = cache_stem(track.locator)
stem = f"ytdlp-{h}" # Reuse a still-cached copy rather than downloading again. The provider
# Reuse a still-cached copy rather than downloading again. # already read this file's tags into the Track (see YtdlpProvider), so
for existing in self._cache_dir.glob(f"{stem}.*"): # its metadata is the corrected one.
if not existing.name.endswith(".part"): existing = cached_file(self._cache_dir, track.locator)
return existing if existing is not None:
return existing, track
tmp_base = self._cache_dir / f".{stem}.{uuid4().hex}" tmp_base = self._cache_dir / f".{stem}.{uuid4().hex}"
opts = { opts = {
@ -58,7 +79,56 @@ class YtdlpFetcher:
leftover.unlink(missing_ok=True) leftover.unlink(missing_ok=True)
raise raise
log.info("downloaded %s -> %s", track, dest.name) log.info("downloaded %s -> %s", track, dest.name)
return dest return dest, self._retag(dest, info, track)
def _retag(self, dest: Path, info: dict, track: Track) -> Track:
"""For bandcamp downloads, ensure sane ID3 tags and refine the Track.
Bandcamp label accounts leave the flat-extracted artist pointing at the
label and many uploads ship untagged. Three cases, in order:
- the file is *already* tagged (bandcamp did it right): keep those tags,
just adopt them into the Track so the stream annotation matches;
- otherwise search MusicBrainz from the download metadata/filename and,
on a confident match, write its canonical artist/title;
- failing that, fall back to parsing the filename-style title.
The tags land in the file, which is the single source of truth: the
provider reads them back on the next pick (before the scheduler keys the
track), and a cache-reuse hit inherits them via that same Track. Any
failure leaves the Track untouched.
"""
if not tagging.is_bandcamp(info, track.locator):
return track
try:
existing = tagging.read_tags(dest)
if existing is not None:
log.info("keeping bandcamp tags on %s: %s%s",
dest.name, existing.artist, existing.title)
meta = existing
else:
guess = tagging.guess_metadata(info, track.source_url)
mbid, mb_artist, mb_title = self._canonicalizer.identify(
guess.artist, guess.title
)
if mb_artist and mb_title:
meta = tagging.TagMeta(mb_artist, mb_title, guess.album, mbid)
else: # MusicBrainz had nothing: trust the filename
meta = guess
tagging.write_tags(dest, meta)
log.info(
"tagged %s as %s%s%s",
dest.name,
meta.artist,
meta.title,
f" [{meta.album}]" if meta.album else "",
)
except Exception:
log.exception("tagging failed for %s", dest.name)
return track
return replace(
track, artist=meta.artist, title=meta.title, mbid=meta.mbid or track.mbid
)
def _downloaded_path(self, info: dict, tmp_base: Path) -> Path: def _downloaded_path(self, info: dict, tmp_base: Path) -> Path:
reqs = info.get("requested_downloads") reqs = info.get("requested_downloads")

View file

@ -17,8 +17,9 @@ import random
import time import time
from pathlib import Path from pathlib import Path
from .. import config from .. import config, tagging
from ..db import Database from ..db import Database
from ..fetchers.ytdlp import cached_file
from ..models import Track from ..models import Track
log = logging.getLogger("radieo.provider.ytdlp") log = logging.getLogger("radieo.provider.ytdlp")
@ -27,9 +28,10 @@ log = logging.getLogger("radieo.provider.ytdlp")
class YtdlpProvider: class YtdlpProvider:
name = "ytdlp" name = "ytdlp"
def __init__(self, urls_file: Path, db: Database): def __init__(self, urls_file: Path, db: Database, cache_dir: Path):
self._urls_file = urls_file self._urls_file = urls_file
self._db = db self._db = db
self._cache_dir = cache_dir
self._urls: list[str] = [] self._urls: list[str] = []
self._loaded_at = 0.0 self._loaded_at = 0.0
# source URL -> (entries, loaded_at); entries are normalized dicts. # source URL -> (entries, loaded_at); entries are normalized dicts.
@ -83,15 +85,32 @@ class YtdlpProvider:
return None return None
pool = [e for e in entries if e["url"] not in recent] or entries pool = [e for e in entries if e["url"] not in recent] or entries
entry = random.choice(pool) entry = random.choice(pool)
locator = entry["url"]
meta = self._resolved_tags(locator)
return Track( return Track(
backend="ytdlp", backend="ytdlp",
locator=entry["url"], locator=locator,
artist=entry.get("artist") or "Unknown artist", artist=(meta.artist if meta else entry.get("artist")) or "Unknown artist",
title=entry.get("title") or entry["url"], title=(meta.title if meta else entry.get("title")) or locator,
origin=self.name, origin=self.name,
source_url=src_url if src_url != entry["url"] else None, mbid=meta.mbid if meta else None,
source_url=src_url if src_url != locator else None,
) )
def _resolved_tags(self, locator: str):
"""Corrected identity a previous download wrote into the cached file.
The yt-dlp fetcher fixes bandcamp label accounts and writes the real
artist/title/MBID into the file's tags. Reading them back here — before
the scheduler keys and anti-repeat-checks the track makes selection,
history and cache reuse agree on one identity. Returns None when the
file isn't cached (yet) or carries no usable tags.
"""
if not tagging.is_bandcamp({}, locator):
return None
cached = cached_file(self._cache_dir, locator)
return tagging.read_tags(cached) if cached else None
def _entries_for(self, src_url: str) -> list[dict]: def _entries_for(self, src_url: str) -> list[dict]:
now = time.time() now = time.time()
cached = self._expanded.get(src_url) cached = self._expanded.get(src_url)

View file

@ -64,7 +64,9 @@ class TrackQueue:
log.error("no fetcher for backend %r (%s)", track.backend, track) log.error("no fetcher for backend %r (%s)", track.backend, track)
continue continue
try: try:
path = fetcher.fetch(track) # Fetchers may refine the Track's metadata (e.g. correcting a
# bandcamp label account to the real artist), so take it back.
path, track = fetcher.fetch(track)
except Exception: except Exception:
log.exception("fetch failed for %s", track) log.exception("fetch failed for %s", track)
continue continue

165
ingest/radieo/tagging.py Normal file
View file

@ -0,0 +1,165 @@
"""Guess and write ID3 tags for freshly downloaded bandcamp tracks.
The yt-dlp *flat* extraction the provider uses to pick a track is thin: on a
bandcamp **label** account the uploader is the label rather than the performing
artist, and many bandcamp mp3s ship with no ID3 tags at all. The full metadata
yt-dlp gathers when it actually downloads is richer, so this module re-derives
``(artist, title, album)`` from it trusting the ``"Artist - Title"`` shape of
the track title the way the user's ``id3tag`` one-liner trusts the filename —
and writes the result into the file's tags.
Everything here is best-effort: the artist is *guessed*, then the caller cross-
checks it against MusicBrainz (:meth:`Canonicalizer.identify`) to adopt a
canonical spelling and MBID when one is found confidently. A failure to open or
tag the file just leaves it untagged; playback is unaffected.
"""
import logging
import re
from dataclasses import dataclass
from pathlib import Path
log = logging.getLogger("radieo.tagging")
@dataclass
class TagMeta:
artist: str
title: str
album: str | None = None
mbid: str | None = None
def is_bandcamp(info: dict, locator: str) -> bool:
"""True when the download came from bandcamp (extractor key or host)."""
key = (info.get("extractor_key") or info.get("extractor") or "").lower()
if "bandcamp" in key:
return True
return "bandcamp.com" in (locator or "")
def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta:
"""Derive tags from a yt-dlp *download* info dict.
Priority mirrors the user's shell heuristic: the track title usually carries
``"Artist - Title"`` (split on the *last* ``" - "``), which is more reliable
than the uploader on label accounts. When the title has no separator we fall
back to the explicit ``artist`` field, preferring it over the uploader/label.
"""
raw_title = (info.get("title") or "").strip()
artist, title = _split_artist_title(raw_title)
if not artist:
artist = (
info.get("artist")
or info.get("creator")
or info.get("uploader")
or info.get("channel")
or ""
).strip()
if not title:
title = raw_title or (info.get("track") or "").strip()
album = (info.get("album") or "").strip() or _album_from_url(
source_url or info.get("webpage_url")
)
return TagMeta(
artist=artist or "Unknown artist",
title=title or "Unknown title",
album=album or None,
)
def read_tags(path: Path) -> TagMeta | None:
"""Return the file's existing tags, or None when it has no usable ones.
"Usable" means both an artist and a title are present that is the signal
that bandcamp already tagged the file properly, in which case we leave it
alone rather than second-guessing it with MusicBrainz.
"""
from mutagen import File as MutagenFile
try:
audio = MutagenFile(str(path), easy=True)
except Exception:
return None
if audio is None or not audio.tags:
return None
artist = _first(audio, "artist")
title = _first(audio, "title")
if not (artist and title):
return None
return TagMeta(
artist=artist,
title=title,
album=_first(audio, "album") or None,
mbid=_first(audio, "musicbrainz_trackid") or None,
)
def write_tags(path: Path, meta: TagMeta) -> bool:
"""Write ``meta`` into the file's tags, overwriting existing ones.
Returns True on success. Any failure (unsupported container, unwritable
file) is logged and swallowed tagging never breaks a download.
"""
from mutagen import File as MutagenFile
try:
audio = MutagenFile(str(path), easy=True)
except Exception as exc: # corrupt/unknown file: leave it alone
log.warning("cannot open %s for tagging: %s", Path(path).name, exc)
return False
if audio is None:
log.info("no mutagen handler for %s; skipping tags", Path(path).name)
return False
try:
if audio.tags is None:
audio.add_tags()
audio["artist"] = meta.artist
audio["title"] = meta.title
if meta.album:
audio["album"] = meta.album
if meta.mbid:
try:
audio["musicbrainz_trackid"] = meta.mbid
except (KeyError, ValueError):
pass # container's easy interface doesn't map this key
audio.save()
return True
except Exception as exc:
log.warning("could not write tags to %s: %s", Path(path).name, exc)
return False
# --- helpers --------------------------------------------------------------
def _first(audio, key: str) -> str:
"""First value of an easy-tag key, stripped; "" when absent."""
values = audio.tags.get(key) or []
return values[0].strip() if values and values[0] else ""
def _split_artist_title(text: str) -> tuple[str | None, str]:
"""Split ``"Artist - Title"`` on the last ``" - "`` (like the shell one-liner).
Returns ``(None, text)`` when there is no separator to split on.
"""
idx = text.rfind(" - ")
if idx == -1:
return None, text.strip()
return text[:idx].strip() or None, text[idx + 3 :].strip()
def _album_from_url(url: str | None) -> str | None:
"""Deslugify a bandcamp ``/album/<slug>`` path into a readable album name.
Only a fallback for when the info dict carries no album; ``/track/`` URLs
yield nothing.
"""
if not url:
return None
m = re.search(r"/album/([^/?#]+)", url)
if not m:
return None
return m.group(1).replace("-", " ").strip().title() or None

View file

@ -1,2 +1,3 @@
httpx>=0.27 httpx>=0.27
yt-dlp>=2024.1 yt-dlp>=2024.1
mutagen>=1.47