ingest: tag bandcamp downloads and correct label-account artists
All checks were successful
continuous-integration/drone/push Build is passing

yt-dlp's flat extraction mis-attributes bandcamp uploads: on a label
account the artist becomes the label, and many files ship untagged. After
a download, guess (artist, title, album) from the richer download metadata
and the "Artist - Title" filename shape, confirm against MusicBrainz, and
write ID3 tags into the file. Existing tags are respected; MusicBrainz is
primary, filename parsing only a fallback.

The file's tags are the single source of truth: the provider reads them
back on the next pick, so the corrected identity (incl. MBID) is in place
before the scheduler keys and anti-repeat-checks the track, and cache
reuse inherits it too.

Fetchers now return (path, Track) so the corrected metadata flows back.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
nemunaire 2026-07-03 11:01:24 +08:00
commit fbdb2d6bb3
8 changed files with 346 additions and 36 deletions

View file

@ -19,14 +19,18 @@ class _NullCanonicalizer:
def canonicalize(self, track):
return track
def identify(self, artist, title):
return None, None, None
def close(self):
pass
def _build_pipeline(db: Database):
def _build_pipeline(db: Database, canonicalizer):
"""Return (providers, fetchers). Assembles whichever sources are enabled;
when none is, the scheduler yields nothing and the stream plays its local
cache fallback."""
cache fallback. ``canonicalizer`` is handed to the yt-dlp fetcher so it can
confirm guessed bandcamp tags against MusicBrainz."""
providers = [] # list[(provider, weight)]
fetchers = {} # backend name -> fetcher
subsonic_client = None # reused by ListenBrainz for resolution
@ -54,9 +58,9 @@ def _build_pipeline(db: Database):
from .fetchers.ytdlp import YtdlpFetcher
from .providers.ytdlp import YtdlpProvider
provider = YtdlpProvider(config.YTDLP_URLS_FILE, db)
provider = YtdlpProvider(config.YTDLP_URLS_FILE, db, config.CACHE_DIR)
providers.append((provider, config.SOURCE_WEIGHTS["ytdlp"]))
fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR)
fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR, canonicalizer)
log.info(
"yt-dlp source enabled (urls=%s, weight=%d)",
config.YTDLP_URLS_FILE,
@ -78,7 +82,7 @@ def _build_pipeline(db: Database):
if "ytdlp" not in fetchers:
from .fetchers.ytdlp import YtdlpFetcher
fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR)
fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR, canonicalizer)
provider = ListenBrainzProvider(
config.LISTENBRAINZ_URL,
db,
@ -148,7 +152,7 @@ def main() -> None:
canonicalizer = _NullCanonicalizer()
log.info("Canonicalizer disabled: tracks keyed by (artist, title).")
providers, fetchers = _build_pipeline(db)
providers, fetchers = _build_pipeline(db, canonicalizer)
scheduler = Scheduler(providers, canonicalizer, db)
queue = TrackQueue(scheduler, fetchers, db)
queue.start()

View file

@ -49,15 +49,49 @@ class Canonicalizer:
cached, mbid = self._db.get_canonical(artist_norm, title_norm)
if not cached:
ok, mbid = self._lookup(track.artist, track.title)
ok, rec = self._search(track.artist, track.title)
if ok: # cache hits and genuine misses; skip transient errors
mbid = rec.get("id") if rec else None
log.info(
"%s for %s%s",
f"MBID {mbid}" if mbid else "no confident MBID",
track.artist,
track.title,
)
self._db.put_canonical(artist_norm, title_norm, mbid)
return replace(track, mbid=mbid) if mbid else track
def identify(
self, artist: str, title: str
) -> tuple[str | None, str | None, str | None]:
"""Best-effort MusicBrainz identification for tagging.
Returns ``(mbid, canonical_artist, canonical_title)``; any field may be
None. Unlike :meth:`canonicalize` it also hands back the recording's
canonical names so a caller can clean up a mis-tagged file. Warms the
MBID cache as a side effect but is not itself cached, so call it at most
once per download.
"""
artist_norm = norm_name(artist)
title_norm = norm_name(title)
if not artist_norm or not title_norm:
return None, None, None
ok, rec = self._search(artist, title)
if not ok or not rec:
return None, None, None
mbid = rec.get("id")
if mbid:
self._db.put_canonical(artist_norm, title_norm, mbid)
return mbid, _artist_credit(rec), rec.get("title")
# --- MusicBrainz ------------------------------------------------------
def _lookup(self, artist: str, title: str) -> tuple[bool, str | None]:
"""Return (ok, mbid). ``ok`` False on a transient error (do not cache)."""
def _search(self, artist: str, title: str) -> tuple[bool, dict | None]:
"""Query MusicBrainz for the best recording match above the threshold.
Returns ``(ok, recording)``; ``ok`` False on a transient error (caller
must not cache), ``recording`` None on a genuine no-confident-match.
"""
query = f'artist:"{_escape(artist)}" AND recording:"{_escape(title)}"'
self._throttle()
try:
@ -72,14 +106,9 @@ class Canonicalizer:
return False, None
recordings = data.get("recordings") or []
if recordings:
best = recordings[0]
if int(best.get("score", 0)) >= config.CANONICAL_MIN_SCORE:
mbid = best.get("id")
log.info("MBID %s for %s%s", mbid, artist, title)
return True, mbid
log.info("no confident MBID for %s%s", artist, title)
return True, None # genuine miss: cache it
if recordings and int(recordings[0].get("score", 0)) >= config.CANONICAL_MIN_SCORE:
return True, recordings[0]
return True, None
def _throttle(self) -> None:
with self._rate_lock:
@ -96,3 +125,22 @@ class Canonicalizer:
# quoted phrase.
def _escape(value: str) -> str:
return value.replace("\\", "\\\\").replace('"', '\\"')
def _artist_credit(rec: dict) -> str | None:
"""Flatten a recording's ``artist-credit`` into a display name.
MusicBrainz splits collaborations into credited parts joined by phrases
(e.g. ``"A"`` + ``" & "`` + ``"B"``); stitch them back together.
"""
credits = rec.get("artist-credit") or []
parts: list[str] = []
for c in credits:
if isinstance(c, str):
parts.append(c)
continue
name = c.get("name") or (c.get("artist") or {}).get("name") or ""
parts.append(name)
parts.append(c.get("joinphrase") or "")
name = "".join(parts).strip()
return name or None

View file

@ -25,12 +25,12 @@ class SubsonicFetcher:
self._client = client
self._cache_dir = cache_dir
def fetch(self, track: Track) -> Path:
def fetch(self, track: Track) -> tuple[Path, Track]:
stem = f"subsonic-{_SAFE.sub('_', track.locator)}"
# Reuse a still-cached copy rather than downloading again.
for existing in self._cache_dir.glob(f"{stem}.*"):
if not existing.name.endswith(".part"):
return existing
return existing, track
tmp = self._cache_dir / f".{uuid4().hex}.part"
try:
@ -43,4 +43,5 @@ class SubsonicFetcher:
tmp.unlink(missing_ok=True)
raise
log.info("downloaded %s -> %s", track, dest.name)
return dest
# Subsonic files are already tagged by the library server; pass through.
return dest, track

View file

@ -12,27 +12,48 @@ Liquidsoap decodes it via ffmpeg, so no re-encoding is needed.
import hashlib
import logging
from dataclasses import replace
from pathlib import Path
from uuid import uuid4
from .. import tagging
from ..models import Track
log = logging.getLogger("radieo.fetcher.ytdlp")
def cache_stem(locator: str) -> str:
"""Cache filename stem for a yt-dlp locator (shared with the provider)."""
h = hashlib.sha1(locator.encode()).hexdigest()[:16]
return f"ytdlp-{h}"
def cached_file(cache_dir: Path, locator: str) -> Path | None:
"""The ready (non-.part) cached file for a locator, if one exists."""
for existing in cache_dir.glob(f"{cache_stem(locator)}.*"):
if not existing.name.endswith(".part"):
return existing
return None
class YtdlpFetcher:
backend = "ytdlp"
def __init__(self, cache_dir: Path):
def __init__(self, cache_dir: Path, canonicalizer):
self._cache_dir = cache_dir
# Cross-checks guessed bandcamp tags against MusicBrainz. A
# _NullCanonicalizer stands in when the lookup is disabled, so this is
# always safe to call.
self._canonicalizer = canonicalizer
def fetch(self, track: Track) -> Path:
h = hashlib.sha1(track.locator.encode()).hexdigest()[:16]
stem = f"ytdlp-{h}"
# Reuse a still-cached copy rather than downloading again.
for existing in self._cache_dir.glob(f"{stem}.*"):
if not existing.name.endswith(".part"):
return existing
def fetch(self, track: Track) -> tuple[Path, Track]:
stem = cache_stem(track.locator)
# Reuse a still-cached copy rather than downloading again. The provider
# already read this file's tags into the Track (see YtdlpProvider), so
# its metadata is the corrected one.
existing = cached_file(self._cache_dir, track.locator)
if existing is not None:
return existing, track
tmp_base = self._cache_dir / f".{stem}.{uuid4().hex}"
opts = {
@ -58,7 +79,56 @@ class YtdlpFetcher:
leftover.unlink(missing_ok=True)
raise
log.info("downloaded %s -> %s", track, dest.name)
return dest
return dest, self._retag(dest, info, track)
def _retag(self, dest: Path, info: dict, track: Track) -> Track:
"""For bandcamp downloads, ensure sane ID3 tags and refine the Track.
Bandcamp label accounts leave the flat-extracted artist pointing at the
label and many uploads ship untagged. Three cases, in order:
- the file is *already* tagged (bandcamp did it right): keep those tags,
just adopt them into the Track so the stream annotation matches;
- otherwise search MusicBrainz from the download metadata/filename and,
on a confident match, write its canonical artist/title;
- failing that, fall back to parsing the filename-style title.
The tags land in the file, which is the single source of truth: the
provider reads them back on the next pick (before the scheduler keys the
track), and a cache-reuse hit inherits them via that same Track. Any
failure leaves the Track untouched.
"""
if not tagging.is_bandcamp(info, track.locator):
return track
try:
existing = tagging.read_tags(dest)
if existing is not None:
log.info("keeping bandcamp tags on %s: %s%s",
dest.name, existing.artist, existing.title)
meta = existing
else:
guess = tagging.guess_metadata(info, track.source_url)
mbid, mb_artist, mb_title = self._canonicalizer.identify(
guess.artist, guess.title
)
if mb_artist and mb_title:
meta = tagging.TagMeta(mb_artist, mb_title, guess.album, mbid)
else: # MusicBrainz had nothing: trust the filename
meta = guess
tagging.write_tags(dest, meta)
log.info(
"tagged %s as %s%s%s",
dest.name,
meta.artist,
meta.title,
f" [{meta.album}]" if meta.album else "",
)
except Exception:
log.exception("tagging failed for %s", dest.name)
return track
return replace(
track, artist=meta.artist, title=meta.title, mbid=meta.mbid or track.mbid
)
def _downloaded_path(self, info: dict, tmp_base: Path) -> Path:
reqs = info.get("requested_downloads")

View file

@ -17,8 +17,9 @@ import random
import time
from pathlib import Path
from .. import config
from .. import config, tagging
from ..db import Database
from ..fetchers.ytdlp import cached_file
from ..models import Track
log = logging.getLogger("radieo.provider.ytdlp")
@ -27,9 +28,10 @@ log = logging.getLogger("radieo.provider.ytdlp")
class YtdlpProvider:
name = "ytdlp"
def __init__(self, urls_file: Path, db: Database):
def __init__(self, urls_file: Path, db: Database, cache_dir: Path):
self._urls_file = urls_file
self._db = db
self._cache_dir = cache_dir
self._urls: list[str] = []
self._loaded_at = 0.0
# source URL -> (entries, loaded_at); entries are normalized dicts.
@ -83,15 +85,32 @@ class YtdlpProvider:
return None
pool = [e for e in entries if e["url"] not in recent] or entries
entry = random.choice(pool)
locator = entry["url"]
meta = self._resolved_tags(locator)
return Track(
backend="ytdlp",
locator=entry["url"],
artist=entry.get("artist") or "Unknown artist",
title=entry.get("title") or entry["url"],
locator=locator,
artist=(meta.artist if meta else entry.get("artist")) or "Unknown artist",
title=(meta.title if meta else entry.get("title")) or locator,
origin=self.name,
source_url=src_url if src_url != entry["url"] else None,
mbid=meta.mbid if meta else None,
source_url=src_url if src_url != locator else None,
)
def _resolved_tags(self, locator: str):
"""Corrected identity a previous download wrote into the cached file.
The yt-dlp fetcher fixes bandcamp label accounts and writes the real
artist/title/MBID into the file's tags. Reading them back here — before
the scheduler keys and anti-repeat-checks the track makes selection,
history and cache reuse agree on one identity. Returns None when the
file isn't cached (yet) or carries no usable tags.
"""
if not tagging.is_bandcamp({}, locator):
return None
cached = cached_file(self._cache_dir, locator)
return tagging.read_tags(cached) if cached else None
def _entries_for(self, src_url: str) -> list[dict]:
now = time.time()
cached = self._expanded.get(src_url)

View file

@ -64,7 +64,9 @@ class TrackQueue:
log.error("no fetcher for backend %r (%s)", track.backend, track)
continue
try:
path = fetcher.fetch(track)
# Fetchers may refine the Track's metadata (e.g. correcting a
# bandcamp label account to the real artist), so take it back.
path, track = fetcher.fetch(track)
except Exception:
log.exception("fetch failed for %s", track)
continue

165
ingest/radieo/tagging.py Normal file
View file

@ -0,0 +1,165 @@
"""Guess and write ID3 tags for freshly downloaded bandcamp tracks.
The yt-dlp *flat* extraction the provider uses to pick a track is thin: on a
bandcamp **label** account the uploader is the label rather than the performing
artist, and many bandcamp mp3s ship with no ID3 tags at all. The full metadata
yt-dlp gathers when it actually downloads is richer, so this module re-derives
``(artist, title, album)`` from it trusting the ``"Artist - Title"`` shape of
the track title the way the user's ``id3tag`` one-liner trusts the filename —
and writes the result into the file's tags.
Everything here is best-effort: the artist is *guessed*, then the caller cross-
checks it against MusicBrainz (:meth:`Canonicalizer.identify`) to adopt a
canonical spelling and MBID when one is found confidently. A failure to open or
tag the file just leaves it untagged; playback is unaffected.
"""
import logging
import re
from dataclasses import dataclass
from pathlib import Path
log = logging.getLogger("radieo.tagging")
@dataclass
class TagMeta:
artist: str
title: str
album: str | None = None
mbid: str | None = None
def is_bandcamp(info: dict, locator: str) -> bool:
"""True when the download came from bandcamp (extractor key or host)."""
key = (info.get("extractor_key") or info.get("extractor") or "").lower()
if "bandcamp" in key:
return True
return "bandcamp.com" in (locator or "")
def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta:
"""Derive tags from a yt-dlp *download* info dict.
Priority mirrors the user's shell heuristic: the track title usually carries
``"Artist - Title"`` (split on the *last* ``" - "``), which is more reliable
than the uploader on label accounts. When the title has no separator we fall
back to the explicit ``artist`` field, preferring it over the uploader/label.
"""
raw_title = (info.get("title") or "").strip()
artist, title = _split_artist_title(raw_title)
if not artist:
artist = (
info.get("artist")
or info.get("creator")
or info.get("uploader")
or info.get("channel")
or ""
).strip()
if not title:
title = raw_title or (info.get("track") or "").strip()
album = (info.get("album") or "").strip() or _album_from_url(
source_url or info.get("webpage_url")
)
return TagMeta(
artist=artist or "Unknown artist",
title=title or "Unknown title",
album=album or None,
)
def read_tags(path: Path) -> TagMeta | None:
"""Return the file's existing tags, or None when it has no usable ones.
"Usable" means both an artist and a title are present that is the signal
that bandcamp already tagged the file properly, in which case we leave it
alone rather than second-guessing it with MusicBrainz.
"""
from mutagen import File as MutagenFile
try:
audio = MutagenFile(str(path), easy=True)
except Exception:
return None
if audio is None or not audio.tags:
return None
artist = _first(audio, "artist")
title = _first(audio, "title")
if not (artist and title):
return None
return TagMeta(
artist=artist,
title=title,
album=_first(audio, "album") or None,
mbid=_first(audio, "musicbrainz_trackid") or None,
)
def write_tags(path: Path, meta: TagMeta) -> bool:
"""Write ``meta`` into the file's tags, overwriting existing ones.
Returns True on success. Any failure (unsupported container, unwritable
file) is logged and swallowed tagging never breaks a download.
"""
from mutagen import File as MutagenFile
try:
audio = MutagenFile(str(path), easy=True)
except Exception as exc: # corrupt/unknown file: leave it alone
log.warning("cannot open %s for tagging: %s", Path(path).name, exc)
return False
if audio is None:
log.info("no mutagen handler for %s; skipping tags", Path(path).name)
return False
try:
if audio.tags is None:
audio.add_tags()
audio["artist"] = meta.artist
audio["title"] = meta.title
if meta.album:
audio["album"] = meta.album
if meta.mbid:
try:
audio["musicbrainz_trackid"] = meta.mbid
except (KeyError, ValueError):
pass # container's easy interface doesn't map this key
audio.save()
return True
except Exception as exc:
log.warning("could not write tags to %s: %s", Path(path).name, exc)
return False
# --- helpers --------------------------------------------------------------
def _first(audio, key: str) -> str:
"""First value of an easy-tag key, stripped; "" when absent."""
values = audio.tags.get(key) or []
return values[0].strip() if values and values[0] else ""
def _split_artist_title(text: str) -> tuple[str | None, str]:
"""Split ``"Artist - Title"`` on the last ``" - "`` (like the shell one-liner).
Returns ``(None, text)`` when there is no separator to split on.
"""
idx = text.rfind(" - ")
if idx == -1:
return None, text.strip()
return text[:idx].strip() or None, text[idx + 3 :].strip()
def _album_from_url(url: str | None) -> str | None:
"""Deslugify a bandcamp ``/album/<slug>`` path into a readable album name.
Only a fallback for when the info dict carries no album; ``/track/`` URLs
yield nothing.
"""
if not url:
return None
m = re.search(r"/album/([^/?#]+)", url)
if not m:
return None
return m.group(1).replace("-", " ").strip().title() or None

View file

@ -1,2 +1,3 @@
httpx>=0.27
yt-dlp>=2024.1
mutagen>=1.47