From 66d93e5034ef9ae678cdb9dd61ff1f4dac328528 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 2 Jul 2026 19:10:01 +0800 Subject: [PATCH] Milestone 6: ListenBrainz recommendations provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a third playback source: a ListenBrainz recommendations Atom feed. Each suggestion already carries a MusicBrainz recording MBID, title and artist, so it is keyed directly by MBID (source-agnostic identity, no extra lookup) and resolved to a concrete file — Navidrome search3 first, then a yt-dlp ytsearch1: fallback. - providers/listenbrainz.py: parse the Atom/HTML feed, anti-repeat on the MBID key, resolve Navidrome-then-yt-dlp. Feed may be an http(s) URL or a local path (for testing). - subsonic.py: add search_songs (search3) for resolution. - canonicalizer.py: short-circuit when a Track already has an MBID, so feed-provided MBIDs are trusted and MusicBrainz is not hit. - __main__.py: wire the provider in; register the yt-dlp fetcher as a resolution backend even when the yt-dlp source is off; close providers on shutdown. - config/compose/.env.example: RADIEO_LISTENBRAINZ_URL + weight. Co-Authored-By: Claude Opus 4.8 --- .env.example | 8 ++ .gitignore | 3 + README.md | 42 ++++-- docker-compose.yml | 4 + ingest/radieo/__main__.py | 41 +++++- ingest/radieo/canonicalizer.py | 3 + ingest/radieo/config.py | 10 ++ ingest/radieo/providers/listenbrainz.py | 178 ++++++++++++++++++++++++ ingest/radieo/subsonic.py | 11 ++ 9 files changed, 284 insertions(+), 16 deletions(-) create mode 100644 ingest/radieo/providers/listenbrainz.py diff --git a/.env.example b/.env.example index 67fea57..c033ba4 100644 --- a/.env.example +++ b/.env.example @@ -14,10 +14,18 @@ RADIEO_NAVIDROME_PLAYLIST=Radio # La liste d'URL se met dans config/urls.txt (copier config/urls.txt.example). # Rien à mettre ici ; le fichier absent désactive simplement la source. 
+# --- Source ListenBrainz --- +# Feed Atom de recommandations. URL http(s) du feed de syndication, ou chemin +# local sous /config (ex. /config/recommendations.xml) pour tester. Vide = off. +# ListenBrainz ne fait que *nommer* des morceaux : chacun est résolu vers +# Navidrome puis, à défaut, yt-dlp. +RADIEO_LISTENBRAINZ_URL=https://listenbrainz.org/syndication-feed/user/monuser/recommendations/weekly-exploration + # --- Dosage du mix entre sources (optionnel) --- # Poids relatifs de tirage de chaque source (0 désactive la source). RADIEO_WEIGHT_NAVIDROME=3 RADIEO_WEIGHT_YTDLP=1 +RADIEO_WEIGHT_LISTENBRAINZ=2 # --- Canonicalizer MBID (optionnel) --- # Résout (artiste, titre) -> MBID MusicBrainz pour dédupliquer entre sources. diff --git a/.gitignore b/.gitignore index 9163637..1b6794b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ # Liste de sources yt-dlp personnelle (garder seulement l'exemple) /config/urls.txt +# Échantillon de feed ListenBrainz personnel +/config/recommendations.xml + # État de l'ingestion (base SQLite persistante) /state/ *.db diff --git a/README.md b/README.md index 5e979b4..6d29cfa 100644 --- a/README.md +++ b/README.md @@ -61,18 +61,33 @@ the stream plays whatever is already in `cache/` (the milestone-1/2 behaviour). For the yt-dlp source, list the URLs to draw from in `config/urls.txt` (copy `config/urls.txt.example`). Each line is either a direct track URL or a container URL (playlist, album, label, artist page) from which one track is -picked at random. The relative mix between sources is set by -`RADIEO_WEIGHT_NAVIDROME` / `RADIEO_WEIGHT_YTDLP` (a weight of 0 disables a -source); the file being absent also disables yt-dlp. +picked at random. + +For the ListenBrainz source, set `RADIEO_LISTENBRAINZ_URL` to your +recommendations feed (the Atom syndication URL, e.g. +`https://listenbrainz.org/syndication-feed/user/<user>/recommendations/weekly-exploration`, +or a local file path under `config/` for testing). 
ListenBrainz only *names* +tracks, so each suggestion is resolved to a real file: Navidrome first +(a `search3` lookup), then yt-dlp (`ytsearch1:`) as a fallback. The +MusicBrainz recording MBID that the feed already carries is used as the +track's canonical identity (no extra lookup needed). + +The relative mix between sources is set by `RADIEO_WEIGHT_NAVIDROME` / +`RADIEO_WEIGHT_YTDLP` / `RADIEO_WEIGHT_LISTENBRAINZ` (a weight of 0 disables a +source); an empty URL / missing file also disables the corresponding source. ## Current status -**Milestone 5 — MBID canonicalizer: done.** +**Milestone 6 — ListenBrainz provider: done.** -- Two playback sources feed a weighted scheduler: a Navidrome/OpenSubsonic - playlist and a hand-maintained list of yt-dlp URLs (`config/urls.txt`). - Container URLs (playlist/album/label/artist) are expanded and one track is - drawn at random. +- Three playback sources feed a weighted scheduler: a Navidrome/OpenSubsonic + playlist, a hand-maintained list of yt-dlp URLs (`config/urls.txt`), and a + ListenBrainz recommendations feed. Container URLs (playlist/album/label/artist) + are expanded and one track is drawn at random. +- ListenBrainz suggestions carry a MusicBrainz recording MBID, a title and an + artist; each is resolved to a concrete file (Navidrome `search3` first, then + a yt-dlp `ytsearch1:` fallback) and keyed directly by its MBID — so the same + song is de-duplicated across all three sources for free. - Each track is canonicalized to a MusicBrainz recording MBID (no API key needed; ~1 req/s, best-effort, results cached in SQLite). This gives a source-agnostic identity, so the same song from two sources collapses to one; @@ -92,9 +107,10 @@ source); the file being absent also disables yt-dlp. - HTTP stream served at `http://localhost:8000/radio.mp3` (MP3, 192 kbps), multiple simultaneous listeners supported. -The ListenBrainz suggestion feed comes next. 
(Known cosmetic quirk: at startup -the fallback logs a few harmless ffmpeg "Invalid data" warnings while probing -non-audio files such as `.gitkeep`; to be quieted in the polish milestone.) +Polish comes next (crossfade tuning, robustness, optional web player, config +file). (Known cosmetic quirk: at startup the fallback logs a few harmless +ffmpeg "Invalid data" warnings while probing non-audio files such as +`.gitkeep`; to be quieted in the polish milestone.) ## Roadmap @@ -107,6 +123,6 @@ non-audio files such as `.gitkeep`; to be quieted in the polish milestone.) weighted mixing between sources. 5. ✅ **Canonicalizer** — MusicBrainz MBID lookup for source-agnostic de-duplication. -6. **ListenBrainz provider** — parse the RSS suggestions feed and resolve each - one to Navidrome or yt-dlp. +6. ✅ **ListenBrainz provider** — parse the recommendations feed and resolve + each suggestion to Navidrome or yt-dlp. 7. **Polish** — crossfade, robustness, optional web player, config file. diff --git a/docker-compose.yml b/docker-compose.yml index 0862ffd..24f1730 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,9 +19,13 @@ services: - RADIEO_RETENTION_KEEP=${RADIEO_RETENTION_KEEP:-20} # Source yt-dlp : liste d'URL dans config/urls.txt (créer depuis l'exemple). - RADIEO_YTDLP_URLS_FILE=/config/urls.txt + # Source ListenBrainz : URL du feed de recommandations (ou chemin local + # sous /config, ex. /config/recommendations.xml). Vide désactive la source. + - RADIEO_LISTENBRAINZ_URL=${RADIEO_LISTENBRAINZ_URL:-} # Dosage du mix entre les sources (0 désactive). - RADIEO_WEIGHT_NAVIDROME=${RADIEO_WEIGHT_NAVIDROME:-3} - RADIEO_WEIGHT_YTDLP=${RADIEO_WEIGHT_YTDLP:-1} + - RADIEO_WEIGHT_LISTENBRAINZ=${RADIEO_WEIGHT_LISTENBRAINZ:-2} # Canonicalizer MusicBrainz (identité MBID inter-sources ; sans clé). 
- RADIEO_CANONICAL_ENABLED=${RADIEO_CANONICAL_ENABLED:-1} - RADIEO_USER_AGENT=${RADIEO_USER_AGENT:-radieo/0.1 (personal music radio)} diff --git a/ingest/radieo/__main__.py b/ingest/radieo/__main__.py index 831946c..7923a9c 100644 --- a/ingest/radieo/__main__.py +++ b/ingest/radieo/__main__.py @@ -27,18 +27,19 @@ def _build_pipeline(db: Database): cache fallback.""" providers = [] # list[(provider, weight)] fetchers = {} # backend name -> fetcher + subsonic_client = None # reused by ListenBrainz for resolution if config.NAVIDROME_ENABLED: from .fetchers.subsonic import SubsonicFetcher from .providers.navidrome import NavidromeProvider from .subsonic import SubsonicClient - client = SubsonicClient( + subsonic_client = SubsonicClient( config.NAVIDROME_URL, config.NAVIDROME_USER, config.NAVIDROME_PASSWORD ) - provider = NavidromeProvider(client, config.NAVIDROME_PLAYLIST, db) + provider = NavidromeProvider(subsonic_client, config.NAVIDROME_PLAYLIST, db) providers.append((provider, config.SOURCE_WEIGHTS.get("navidrome", 0))) - fetchers["subsonic"] = SubsonicFetcher(client, config.CACHE_DIR) + fetchers["subsonic"] = SubsonicFetcher(subsonic_client, config.CACHE_DIR) log.info( "Navidrome source enabled (playlist=%r, weight=%d)", config.NAVIDROME_PLAYLIST, @@ -66,6 +67,36 @@ def _build_pipeline(db: Database): config.YTDLP_URLS_FILE.exists(), ) + if config.LISTENBRAINZ_ENABLED and config.SOURCE_WEIGHTS.get("listenbrainz", 0) > 0: + from .providers.listenbrainz import ListenBrainzProvider + + # ListenBrainz has no backend of its own: it resolves each suggestion to + # Navidrome then yt-dlp. Make sure the yt-dlp fetcher exists as a + # resolution target even when the yt-dlp *source* is off. 
+ if "ytdlp" not in fetchers: + from .fetchers.ytdlp import YtdlpFetcher + + fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR) + provider = ListenBrainzProvider( + config.LISTENBRAINZ_URL, + db, + subsonic_client=subsonic_client, + ytdlp_enabled="ytdlp" in fetchers, + ) + providers.append((provider, config.SOURCE_WEIGHTS["listenbrainz"])) + log.info( + "ListenBrainz source enabled (feed=%s, weight=%d, resolve=%s)", + config.LISTENBRAINZ_URL, + config.SOURCE_WEIGHTS["listenbrainz"], + "navidrome+ytdlp" if subsonic_client else "ytdlp", + ) + else: + log.info( + "ListenBrainz source off (weight=%d, feed set=%s).", + config.SOURCE_WEIGHTS.get("listenbrainz", 0), + config.LISTENBRAINZ_ENABLED, + ) + if not providers: log.warning("no source active: the stream plays its local cache only.") return providers, fetchers @@ -135,6 +166,10 @@ def main() -> None: finally: queue.stop() server.server_close() + for provider, _ in providers: + close = getattr(provider, "close", None) + if callable(close): + close() canonicalizer.close() db.close() diff --git a/ingest/radieo/canonicalizer.py b/ingest/radieo/canonicalizer.py index bdcbf05..4ce0373 100644 --- a/ingest/radieo/canonicalizer.py +++ b/ingest/radieo/canonicalizer.py @@ -38,6 +38,9 @@ class Canonicalizer: def canonicalize(self, track: Track) -> Track: """Return the Track with ``mbid`` filled when resolvable, else unchanged.""" + if track.mbid: + # Already identified (e.g. ListenBrainz ships the recording MBID). + return track artist_norm = norm_name(track.artist) title_norm = norm_name(track.title) if not artist_norm or not title_norm: diff --git a/ingest/radieo/config.py b/ingest/radieo/config.py index 402dcda..55cb6e0 100644 --- a/ingest/radieo/config.py +++ b/ingest/radieo/config.py @@ -67,10 +67,20 @@ YTDLP_URLS_FILE = Path(os.environ.get("RADIEO_YTDLP_URLS_FILE", "/config/urls.tx # How often to reload the URL list and re-expand container URLs, in seconds. 
YTDLP_REFRESH = float(os.environ.get("RADIEO_YTDLP_REFRESH", "300")) +# --- ListenBrainz suggestions source --- +# Atom recommendations feed. May be an http(s) URL (the real syndication feed) +# or a local file path (for testing with a saved sample). Empty disables it. +# ListenBrainz only *names* tracks; each is resolved to Navidrome then yt-dlp. +LISTENBRAINZ_URL = os.environ.get("RADIEO_LISTENBRAINZ_URL", "").strip() +# How often to reload the feed, in seconds (it refreshes weekly). +LISTENBRAINZ_REFRESH = float(os.environ.get("RADIEO_LISTENBRAINZ_REFRESH", "3600")) +LISTENBRAINZ_ENABLED = bool(LISTENBRAINZ_URL) + # --- Source mix (weights) --- # Relative odds of drawing from each source. Isolated here on purpose so the # mix can later move to a config file. A weight of 0 disables a source. SOURCE_WEIGHTS = { "navidrome": int(os.environ.get("RADIEO_WEIGHT_NAVIDROME", "3")), "ytdlp": int(os.environ.get("RADIEO_WEIGHT_YTDLP", "1")), + "listenbrainz": int(os.environ.get("RADIEO_WEIGHT_LISTENBRAINZ", "2")), } diff --git a/ingest/radieo/providers/listenbrainz.py b/ingest/radieo/providers/listenbrainz.py new file mode 100644 index 0000000..4772484 --- /dev/null +++ b/ingest/radieo/providers/listenbrainz.py @@ -0,0 +1,178 @@ +"""ListenBrainzProvider: play from a ListenBrainz recommendations feed. + +The feed is an Atom document whose ``<content>`` is an HTML list of +recommendations, each a MusicBrainz *recording* link plus a title and artist:: + + <a href=".../recording/MBID">Title</a> + by <a href=".../artist/...">Artist</a> + +So every suggestion already carries a canonical MBID — we set it on the emitted +Track directly, which gives a source-agnostic identity for free (no MusicBrainz +lookup needed). + +Unlike the other providers, ListenBrainz yields no audio of its own: it only +*names* tracks. Each pick is therefore **resolved** to a concrete backend — +Navidrome first (an OpenSubsonic ``search3``), then yt-dlp (a ``ytsearch1:`` +query) as a fallback — so the download layer is unchanged. 
+ +``RADIEO_LISTENBRAINZ_URL`` may be an ``http(s)`` URL (the real syndication +feed) or a local file path (handy for testing with a saved sample). +""" + +import html +import logging +import random +import re +import time +import xml.etree.ElementTree as ET +from pathlib import Path + +import httpx + +from .. import config +from ..db import Database +from ..models import Track, norm_name +from ..subsonic import SubsonicClient, SubsonicError + +log = logging.getLogger("radieo.provider.listenbrainz") + +_ATOM = "{http://www.w3.org/2005/Atom}" + +# Inside each
  • : the recording link (MBID + title) followed by the artist +# link (name). Non-greedy gaps pair each recording with its own artist. +_ITEM_RE = re.compile( + r'(.*?)' + r".*?" + r'(.*?)', + re.DOTALL, +) + + +class ListenBrainzProvider: + name = "listenbrainz" + + def __init__( + self, + url: str, + db: Database, + subsonic_client: SubsonicClient | None, + ytdlp_enabled: bool, + ): + self._url = url + self._db = db + self._subsonic = subsonic_client + self._ytdlp_enabled = ytdlp_enabled + self._http = httpx.Client( + timeout=httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0), + headers={"User-Agent": config.USER_AGENT}, + follow_redirects=True, + ) + self._recs: list[dict] = [] + self._loaded_at = 0.0 + + # --- feed loading ----------------------------------------------------- + + def _recommendations(self) -> list[dict]: + now = time.time() + if self._recs and now - self._loaded_at < config.LISTENBRAINZ_REFRESH: + return self._recs + try: + xml_text = self._fetch_feed() + recs = _parse_feed(xml_text) + except (httpx.HTTPError, OSError, ET.ParseError) as exc: + log.warning("could not load ListenBrainz feed: %s", exc) + return self._recs # keep whatever we had + if recs and recs != self._recs: + log.info("loaded %d ListenBrainz recommendation(s)", len(recs)) + self._recs = recs + self._loaded_at = now + return self._recs + + def _fetch_feed(self) -> str: + if self._url.startswith(("http://", "https://")): + resp = self._http.get(self._url) + resp.raise_for_status() + return resp.text + return Path(self._url).read_text(encoding="utf-8") + + # --- resolution ------------------------------------------------------- + + def next(self) -> Track | None: + recs = self._recommendations() + if not recs: + return None + recent = self._db.recent_keys(config.ANTIREPEAT_WINDOW) + pool = [r for r in recs if f"mbid:{r['mbid']}" not in recent] or list(recs) + random.shuffle(pool) + for rec in pool: + track = self._resolve(rec) + if track is not None: + return track + 
return None + + def _resolve(self, rec: dict) -> Track | None: + artist, title, mbid = rec["artist"], rec["title"], rec["mbid"] + + song = self._search_navidrome(artist, title) + if song is not None: + return Track( + backend="subsonic", + locator=str(song["id"]), + artist=song.get("artist", artist), + title=song.get("title", title), + origin=self.name, + mbid=mbid, + source_ext=song.get("suffix"), + ) + + if self._ytdlp_enabled: + return Track( + backend="ytdlp", + locator=f"ytsearch1:{artist} {title}", + artist=artist, + title=title, + origin=self.name, + mbid=mbid, + ) + return None + + def _search_navidrome(self, artist: str, title: str) -> dict | None: + if self._subsonic is None: + return None + try: + songs = self._subsonic.search_songs(title) + except (SubsonicError, httpx.HTTPError, OSError) as exc: + log.warning("Navidrome search failed for %s — %s: %s", artist, title, exc) + return None + want_a, want_t = norm_name(artist), norm_name(title) + for s in songs: + if norm_name(s.get("title", "")) != want_t: + continue + got_a = norm_name(s.get("artist", "")) + # Exact or containment either way, to absorb "feat." differences. 
+ if got_a == want_a or got_a in want_a or want_a in got_a: + return s + return None + + def close(self) -> None: + self._http.close() + + +def _parse_feed(xml_text: str) -> list[dict]: + """Extract ``{mbid, title, artist}`` dicts from the Atom feed, deduped.""" + root = ET.fromstring(xml_text) + seen: set[str] = set() + out: list[dict] = [] + for entry in root.findall(f"{_ATOM}entry"): + content = entry.find(f"{_ATOM}content") + if content is None or not content.text: + continue + for m in _ITEM_RE.finditer(content.text): + mbid = m.group(1).lower() + title = html.unescape(m.group(2)).strip() + artist = html.unescape(m.group(3)).strip() + if not mbid or not title or not artist or mbid in seen: + continue + seen.add(mbid) + out.append({"mbid": mbid, "title": title, "artist": artist}) + return out diff --git a/ingest/radieo/subsonic.py b/ingest/radieo/subsonic.py index b5365ae..76a3157 100644 --- a/ingest/radieo/subsonic.py +++ b/ingest/radieo/subsonic.py @@ -85,6 +85,17 @@ class SubsonicClient: body = self._get_json("getPlaylist", id=playlist_id) return body.get("playlist", {}).get("entry", []) + def search_songs(self, query: str, count: int = 25) -> list[dict]: + """Full-text song search (``search3``). Songs only, no artists/albums.""" + body = self._get_json( + "search3", + query=query, + songCount=count, + artistCount=0, + albumCount=0, + ) + return body.get("searchResult3", {}).get("song", []) + def download(self, song_id: str, dest: Path, hint_ext: str | None = None) -> str: """Download a song to ``dest``; return the file extension used.