diff --git a/ingest/radieo/__main__.py b/ingest/radieo/__main__.py index a1d03aa..a1a76aa 100644 --- a/ingest/radieo/__main__.py +++ b/ingest/radieo/__main__.py @@ -19,14 +19,18 @@ class _NullCanonicalizer: def canonicalize(self, track): return track + def identify(self, artist, title): + return None, None, None + def close(self): pass -def _build_pipeline(db: Database): +def _build_pipeline(db: Database, canonicalizer): """Return (providers, fetchers). Assembles whichever sources are enabled; when none is, the scheduler yields nothing and the stream plays its local - cache fallback.""" + cache fallback. ``canonicalizer`` is handed to the yt-dlp fetcher so it can + confirm guessed bandcamp tags against MusicBrainz.""" providers = [] # list[(provider, weight)] fetchers = {} # backend name -> fetcher subsonic_client = None # reused by ListenBrainz for resolution @@ -54,9 +58,9 @@ def _build_pipeline(db: Database): from .fetchers.ytdlp import YtdlpFetcher from .providers.ytdlp import YtdlpProvider - provider = YtdlpProvider(config.YTDLP_URLS_FILE, db) + provider = YtdlpProvider(config.YTDLP_URLS_FILE, db, config.CACHE_DIR) providers.append((provider, config.SOURCE_WEIGHTS["ytdlp"])) - fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR) + fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR, canonicalizer) log.info( "yt-dlp source enabled (urls=%s, weight=%d)", config.YTDLP_URLS_FILE, @@ -78,7 +82,7 @@ def _build_pipeline(db: Database): if "ytdlp" not in fetchers: from .fetchers.ytdlp import YtdlpFetcher - fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR) + fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR, canonicalizer) provider = ListenBrainzProvider( config.LISTENBRAINZ_URL, db, @@ -148,7 +152,7 @@ def main() -> None: canonicalizer = _NullCanonicalizer() log.info("Canonicalizer disabled: tracks keyed by (artist, title).") - providers, fetchers = _build_pipeline(db) + providers, fetchers = _build_pipeline(db, canonicalizer) scheduler = Scheduler(providers, canonicalizer, db) queue = TrackQueue(scheduler, fetchers, db) queue.start() diff --git a/ingest/radieo/canonicalizer.py b/ingest/radieo/canonicalizer.py index f9fd348..8ce2ab3 100644 --- a/ingest/radieo/canonicalizer.py +++ b/ingest/radieo/canonicalizer.py @@ -49,15 +49,49 @@ class Canonicalizer: cached, mbid = self._db.get_canonical(artist_norm, title_norm) if not cached: - ok, mbid = self._lookup(track.artist, track.title) + ok, rec = self._search(track.artist, track.title) if ok: # cache hits and genuine misses; skip transient errors + mbid = rec.get("id") if rec else None + log.info( + "%s for %s — %s", + f"MBID {mbid}" if mbid else "no confident MBID", + track.artist, + track.title, + ) self._db.put_canonical(artist_norm, title_norm, mbid) return replace(track, mbid=mbid) if mbid else track + def identify( + self, artist: str, title: str + ) -> tuple[str | None, str | None, str | None]: + """Best-effort MusicBrainz identification for tagging. + + Returns ``(mbid, canonical_artist, canonical_title)``; any field may be + None. Unlike :meth:`canonicalize` it also hands back the recording's + canonical names so a caller can clean up a mis-tagged file. Warms the + MBID cache as a side effect but is not itself cached, so call it at most + once per download. + """ + artist_norm = norm_name(artist) + title_norm = norm_name(title) + if not artist_norm or not title_norm: + return None, None, None + ok, rec = self._search(artist, title) + if not ok or not rec: + return None, None, None + mbid = rec.get("id") + if mbid: + self._db.put_canonical(artist_norm, title_norm, mbid) + return mbid, _artist_credit(rec), rec.get("title") + # --- MusicBrainz ------------------------------------------------------ - def _lookup(self, artist: str, title: str) -> tuple[bool, str | None]: - """Return (ok, mbid). ``ok`` False on a transient error (do not cache).""" + def _search(self, artist: str, title: str) -> tuple[bool, dict | None]: + """Query MusicBrainz for the best recording match above the threshold. + + Returns ``(ok, recording)``; ``ok`` False on a transient error (caller + must not cache), ``recording`` None on a genuine no-confident-match. + """ query = f'artist:"{_escape(artist)}" AND recording:"{_escape(title)}"' self._throttle() try: @@ -72,14 +106,9 @@ class Canonicalizer: return False, None recordings = data.get("recordings") or [] - if recordings: - best = recordings[0] - if int(best.get("score", 0)) >= config.CANONICAL_MIN_SCORE: - mbid = best.get("id") - log.info("MBID %s for %s — %s", mbid, artist, title) - return True, mbid - log.info("no confident MBID for %s — %s", artist, title) - return True, None # genuine miss: cache it + if recordings and int(recordings[0].get("score", 0)) >= config.CANONICAL_MIN_SCORE: + return True, recordings[0] + return True, None def _throttle(self) -> None: with self._rate_lock: @@ -96,3 +125,22 @@ class Canonicalizer: # quoted phrase. def _escape(value: str) -> str: return value.replace("\\", "\\\\").replace('"', '\\"') + + +def _artist_credit(rec: dict) -> str | None: + """Flatten a recording's ``artist-credit`` into a display name. + + MusicBrainz splits collaborations into credited parts joined by phrases + (e.g. ``"A"`` + ``" & "`` + ``"B"``); stitch them back together. + """ + credits = rec.get("artist-credit") or [] + parts: list[str] = [] + for c in credits: + if isinstance(c, str): + parts.append(c) + continue + name = c.get("name") or (c.get("artist") or {}).get("name") or "" + parts.append(name) + parts.append(c.get("joinphrase") or "") + name = "".join(parts).strip() + return name or None diff --git a/ingest/radieo/fetchers/subsonic.py b/ingest/radieo/fetchers/subsonic.py index 52e56fb..c4af567 100644 --- a/ingest/radieo/fetchers/subsonic.py +++ b/ingest/radieo/fetchers/subsonic.py @@ -25,12 +25,12 @@ class SubsonicFetcher: self._client = client self._cache_dir = cache_dir - def fetch(self, track: Track) -> Path: + def fetch(self, track: Track) -> tuple[Path, Track]: stem = f"subsonic-{_SAFE.sub('_', track.locator)}" # Reuse a still-cached copy rather than downloading again. for existing in self._cache_dir.glob(f"{stem}.*"): if not existing.name.endswith(".part"): - return existing + return existing, track tmp = self._cache_dir / f".{uuid4().hex}.part" try: @@ -43,4 +43,5 @@ class SubsonicFetcher: tmp.unlink(missing_ok=True) raise log.info("downloaded %s -> %s", track, dest.name) - return dest + # Subsonic files are already tagged by the library server; pass through. + return dest, track diff --git a/ingest/radieo/fetchers/ytdlp.py b/ingest/radieo/fetchers/ytdlp.py index 79b1d60..95cb534 100644 --- a/ingest/radieo/fetchers/ytdlp.py +++ b/ingest/radieo/fetchers/ytdlp.py @@ -12,27 +12,48 @@ Liquidsoap decodes it via ffmpeg, so no re-encoding is needed. import hashlib import logging +from dataclasses import replace from pathlib import Path from uuid import uuid4 +from .. import tagging from ..models import Track log = logging.getLogger("radieo.fetcher.ytdlp") +def cache_stem(locator: str) -> str: + """Cache filename stem for a yt-dlp locator (shared with the provider).""" + h = hashlib.sha1(locator.encode()).hexdigest()[:16] + return f"ytdlp-{h}" + + +def cached_file(cache_dir: Path, locator: str) -> Path | None: + """The ready (non-.part) cached file for a locator, if one exists.""" + for existing in cache_dir.glob(f"{cache_stem(locator)}.*"): + if not existing.name.endswith(".part"): + return existing + return None + + class YtdlpFetcher: backend = "ytdlp" - def __init__(self, cache_dir: Path): + def __init__(self, cache_dir: Path, canonicalizer): self._cache_dir = cache_dir + # Cross-checks guessed bandcamp tags against MusicBrainz. A + # _NullCanonicalizer stands in when the lookup is disabled, so this is + # always safe to call. + self._canonicalizer = canonicalizer - def fetch(self, track: Track) -> Path: - h = hashlib.sha1(track.locator.encode()).hexdigest()[:16] - stem = f"ytdlp-{h}" - # Reuse a still-cached copy rather than downloading again. - for existing in self._cache_dir.glob(f"{stem}.*"): - if not existing.name.endswith(".part"): - return existing + def fetch(self, track: Track) -> tuple[Path, Track]: + stem = cache_stem(track.locator) + # Reuse a still-cached copy rather than downloading again. The provider + # already read this file's tags into the Track (see YtdlpProvider), so + # its metadata is the corrected one. + existing = cached_file(self._cache_dir, track.locator) + if existing is not None: + return existing, track tmp_base = self._cache_dir / f".{stem}.{uuid4().hex}" opts = { @@ -58,7 +79,56 @@ class YtdlpFetcher: leftover.unlink(missing_ok=True) raise log.info("downloaded %s -> %s", track, dest.name) - return dest + return dest, self._retag(dest, info, track) + + def _retag(self, dest: Path, info: dict, track: Track) -> Track: + """For bandcamp downloads, ensure sane ID3 tags and refine the Track. + + Bandcamp label accounts leave the flat-extracted artist pointing at the + label and many uploads ship untagged. Three cases, in order: + + - the file is *already* tagged (bandcamp did it right): keep those tags, + just adopt them into the Track so the stream annotation matches; + - otherwise search MusicBrainz from the download metadata/filename and, + on a confident match, write its canonical artist/title; + - failing that, fall back to parsing the filename-style title. + + The tags land in the file, which is the single source of truth: the + provider reads them back on the next pick (before the scheduler keys the + track), and a cache-reuse hit inherits them via that same Track. Any + failure leaves the Track untouched. + """ + if not tagging.is_bandcamp(info, track.locator): + return track + try: + existing = tagging.read_tags(dest) + if existing is not None: + log.info("keeping bandcamp tags on %s: %s — %s", + dest.name, existing.artist, existing.title) + meta = existing + else: + guess = tagging.guess_metadata(info, track.source_url) + mbid, mb_artist, mb_title = self._canonicalizer.identify( + guess.artist, guess.title + ) + if mb_artist and mb_title: + meta = tagging.TagMeta(mb_artist, mb_title, guess.album, mbid) + else: # MusicBrainz had nothing: trust the filename + meta = guess + tagging.write_tags(dest, meta) + log.info( + "tagged %s as %s — %s%s", + dest.name, + meta.artist, + meta.title, + f" [{meta.album}]" if meta.album else "", + ) + except Exception: + log.exception("tagging failed for %s", dest.name) + return track + return replace( + track, artist=meta.artist, title=meta.title, mbid=meta.mbid or track.mbid + ) def _downloaded_path(self, info: dict, tmp_base: Path) -> Path: reqs = info.get("requested_downloads") diff --git a/ingest/radieo/providers/ytdlp.py b/ingest/radieo/providers/ytdlp.py index 6ebca85..77f1e3a 100644 --- a/ingest/radieo/providers/ytdlp.py +++ b/ingest/radieo/providers/ytdlp.py @@ -17,8 +17,9 @@ import random import time from pathlib import Path -from .. import config +from .. import config, tagging from ..db import Database +from ..fetchers.ytdlp import cached_file from ..models import Track log = logging.getLogger("radieo.provider.ytdlp") @@ -27,9 +28,10 @@ log = logging.getLogger("radieo.provider.ytdlp") class YtdlpProvider: name = "ytdlp" - def __init__(self, urls_file: Path, db: Database): + def __init__(self, urls_file: Path, db: Database, cache_dir: Path): self._urls_file = urls_file self._db = db + self._cache_dir = cache_dir self._urls: list[str] = [] self._loaded_at = 0.0 # source URL -> (entries, loaded_at); entries are normalized dicts. @@ -83,15 +85,32 @@ class YtdlpProvider: return None pool = [e for e in entries if e["url"] not in recent] or entries entry = random.choice(pool) + locator = entry["url"] + meta = self._resolved_tags(locator) return Track( backend="ytdlp", - locator=entry["url"], - artist=entry.get("artist") or "Unknown artist", - title=entry.get("title") or entry["url"], + locator=locator, + artist=(meta.artist if meta else entry.get("artist")) or "Unknown artist", + title=(meta.title if meta else entry.get("title")) or locator, origin=self.name, - source_url=src_url if src_url != entry["url"] else None, + mbid=meta.mbid if meta else None, + source_url=src_url if src_url != locator else None, ) + def _resolved_tags(self, locator: str): + """Corrected identity a previous download wrote into the cached file. + + The yt-dlp fetcher fixes bandcamp label accounts and writes the real + artist/title/MBID into the file's tags. Reading them back here — before + the scheduler keys and anti-repeat-checks the track — makes selection, + history and cache reuse agree on one identity. Returns None when the + file isn't cached (yet) or carries no usable tags. + """ + if not tagging.is_bandcamp({}, locator): + return None + cached = cached_file(self._cache_dir, locator) + return tagging.read_tags(cached) if cached else None + def _entries_for(self, src_url: str) -> list[dict]: now = time.time() cached = self._expanded.get(src_url) diff --git a/ingest/radieo/queue.py b/ingest/radieo/queue.py index 8ba7447..cb83d0a 100644 --- a/ingest/radieo/queue.py +++ b/ingest/radieo/queue.py @@ -64,7 +64,9 @@ class TrackQueue: log.error("no fetcher for backend %r (%s)", track.backend, track) continue try: - path = fetcher.fetch(track) + # Fetchers may refine the Track's metadata (e.g. correcting a + # bandcamp label account to the real artist), so take it back. + path, track = fetcher.fetch(track) except Exception: log.exception("fetch failed for %s", track) continue diff --git a/ingest/radieo/tagging.py b/ingest/radieo/tagging.py new file mode 100644 index 0000000..7e9a1ce --- /dev/null +++ b/ingest/radieo/tagging.py @@ -0,0 +1,165 @@ +"""Guess and write ID3 tags for freshly downloaded bandcamp tracks. + +The yt-dlp *flat* extraction the provider uses to pick a track is thin: on a +bandcamp **label** account the uploader is the label rather than the performing +artist, and many bandcamp mp3s ship with no ID3 tags at all. The full metadata +yt-dlp gathers when it actually downloads is richer, so this module re-derives +``(artist, title, album)`` from it — trusting the ``"Artist - Title"`` shape of +the track title the way the user's ``id3tag`` one-liner trusts the filename — +and writes the result into the file's tags. + +Everything here is best-effort: the artist is *guessed*, then the caller cross- +checks it against MusicBrainz (:meth:`Canonicalizer.identify`) to adopt a +canonical spelling and MBID when one is found confidently. A failure to open or +tag the file just leaves it untagged; playback is unaffected. +""" + +import logging +import re +from dataclasses import dataclass +from pathlib import Path + +log = logging.getLogger("radieo.tagging") + + +@dataclass +class TagMeta: + artist: str + title: str + album: str | None = None + mbid: str | None = None + + +def is_bandcamp(info: dict, locator: str) -> bool: + """True when the download came from bandcamp (extractor key or host).""" + key = (info.get("extractor_key") or info.get("extractor") or "").lower() + if "bandcamp" in key: + return True + return "bandcamp.com" in (locator or "") + + +def guess_metadata(info: dict, source_url: str | None = None) -> TagMeta: + """Derive tags from a yt-dlp *download* info dict. + + Priority mirrors the user's shell heuristic: the track title usually carries + ``"Artist - Title"`` (split on the *last* ``" - "``), which is more reliable + than the uploader on label accounts. When the title has no separator we fall + back to the explicit ``artist`` field, preferring it over the uploader/label. + """ + raw_title = (info.get("title") or "").strip() + artist, title = _split_artist_title(raw_title) + if not artist: + artist = ( + info.get("artist") + or info.get("creator") + or info.get("uploader") + or info.get("channel") + or "" + ).strip() + if not title: + title = raw_title or (info.get("track") or "").strip() + + album = (info.get("album") or "").strip() or _album_from_url( + source_url or info.get("webpage_url") + ) + return TagMeta( + artist=artist or "Unknown artist", + title=title or "Unknown title", + album=album or None, + ) + + +def read_tags(path: Path) -> TagMeta | None: + """Return the file's existing tags, or None when it has no usable ones. + + "Usable" means both an artist and a title are present — that is the signal + that bandcamp already tagged the file properly, in which case we leave it + alone rather than second-guessing it with MusicBrainz. + """ + from mutagen import File as MutagenFile + + try: + audio = MutagenFile(str(path), easy=True) + except Exception: + return None + if audio is None or not audio.tags: + return None + artist = _first(audio, "artist") + title = _first(audio, "title") + if not (artist and title): + return None + return TagMeta( + artist=artist, + title=title, + album=_first(audio, "album") or None, + mbid=_first(audio, "musicbrainz_trackid") or None, + ) + + +def write_tags(path: Path, meta: TagMeta) -> bool: + """Write ``meta`` into the file's tags, overwriting existing ones. + + Returns True on success. Any failure (unsupported container, unwritable + file) is logged and swallowed — tagging never breaks a download. + """ + from mutagen import File as MutagenFile + + try: + audio = MutagenFile(str(path), easy=True) + except Exception as exc: # corrupt/unknown file: leave it alone + log.warning("cannot open %s for tagging: %s", Path(path).name, exc) + return False + if audio is None: + log.info("no mutagen handler for %s; skipping tags", Path(path).name) + return False + try: + if audio.tags is None: + audio.add_tags() + audio["artist"] = meta.artist + audio["title"] = meta.title + if meta.album: + audio["album"] = meta.album + if meta.mbid: + try: + audio["musicbrainz_trackid"] = meta.mbid + except (KeyError, ValueError): + pass # container's easy interface doesn't map this key + audio.save() + return True + except Exception as exc: + log.warning("could not write tags to %s: %s", Path(path).name, exc) + return False + + +# --- helpers -------------------------------------------------------------- + + +def _first(audio, key: str) -> str: + """First value of an easy-tag key, stripped; "" when absent.""" + values = audio.tags.get(key) or [] + return values[0].strip() if values and values[0] else "" + + +def _split_artist_title(text: str) -> tuple[str | None, str]: + """Split ``"Artist - Title"`` on the last ``" - "`` (like the shell one-liner). + + Returns ``(None, text)`` when there is no separator to split on. + """ + idx = text.rfind(" - ") + if idx == -1: + return None, text.strip() + return text[:idx].strip() or None, text[idx + 3 :].strip() + + +def _album_from_url(url: str | None) -> str | None: + """Deslugify a bandcamp ``/album/`` path into a readable album name. + + Only a fallback for when the info dict carries no album; ``/track/`` URLs + yield nothing. + """ + if not url: + return None + m = re.search(r"/album/([^/?#]+)", url) + if not m: + return None + return m.group(1).replace("-", " ").strip().title() or None diff --git a/ingest/requirements.txt b/ingest/requirements.txt index 59b3f06..a1a66fb 100644 --- a/ingest/requirements.txt +++ b/ingest/requirements.txt @@ -1,2 +1,3 @@ httpx>=0.27 yt-dlp>=2024.1 +mutagen>=1.47