Milestone 6: ListenBrainz recommendations provider

Add a third playback source: a ListenBrainz recommendations Atom feed. Each suggestion already carries a MusicBrainz recording MBID, title and artist, so it is keyed directly by MBID (source-agnostic identity, no extra lookup) and resolved to a concrete file — Navidrome search3 first, then a yt-dlp ytsearch1: fallback.

- providers/listenbrainz.py: parse the Atom/HTML feed, anti-repeat on the MBID key, resolve Navidrome-then-yt-dlp. Feed may be an http(s) URL or a local path (for testing).
- subsonic.py: add search_songs (search3) for resolution.
- canonicalizer.py: short-circuit when a Track already has an MBID, so feed-provided MBIDs are trusted and MusicBrainz is not hit.
- __main__.py: wire the provider in; register the yt-dlp fetcher as a resolution backend even when the yt-dlp source is off; close providers on shutdown.
- config/compose/.env.example: RADIEO_LISTENBRAINZ_URL + weight.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-02 19:10:01 +08:00 · 2026-07-02 19:10:01 +08:00 · 66d93e5034
commit 66d93e5034
parent 7e0f08b863
9 changed files with 284 additions and 16 deletions
--- a/ingest/radieo/main.py
+++ b/ingest/radieo/main.py
@ -27,18 +27,19 @@ def _build_pipeline(db: Database):
    cache fallback."""
    providers = []  # list[(provider, weight)]
    fetchers = {}  # backend name -> fetcher
+    subsonic_client = None  # reused by ListenBrainz for resolution

    if config.NAVIDROME_ENABLED:
        from .fetchers.subsonic import SubsonicFetcher
        from .providers.navidrome import NavidromeProvider
        from .subsonic import SubsonicClient

-        client = SubsonicClient(
+        subsonic_client = SubsonicClient(
            config.NAVIDROME_URL, config.NAVIDROME_USER, config.NAVIDROME_PASSWORD
        )
-        provider = NavidromeProvider(client, config.NAVIDROME_PLAYLIST, db)
+        provider = NavidromeProvider(subsonic_client, config.NAVIDROME_PLAYLIST, db)
        providers.append((provider, config.SOURCE_WEIGHTS.get("navidrome", 0)))
-        fetchers["subsonic"] = SubsonicFetcher(client, config.CACHE_DIR)
+        fetchers["subsonic"] = SubsonicFetcher(subsonic_client, config.CACHE_DIR)
        log.info(
            "Navidrome source enabled (playlist=%r, weight=%d)",
            config.NAVIDROME_PLAYLIST,
@ -66,6 +67,36 @@ def _build_pipeline(db: Database):
            config.YTDLP_URLS_FILE.exists(),
        )

+    if config.LISTENBRAINZ_ENABLED and config.SOURCE_WEIGHTS.get("listenbrainz", 0) > 0:
+        from .providers.listenbrainz import ListenBrainzProvider
+
+        # ListenBrainz has no backend of its own: it resolves each suggestion to
+        # Navidrome then yt-dlp. Make sure the yt-dlp fetcher exists as a
+        # resolution target even when the yt-dlp *source* is off.
+        if "ytdlp" not in fetchers:
+            from .fetchers.ytdlp import YtdlpFetcher
+
+            fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR)
+        provider = ListenBrainzProvider(
+            config.LISTENBRAINZ_URL,
+            db,
+            subsonic_client=subsonic_client,
+            ytdlp_enabled="ytdlp" in fetchers,
+        )
+        providers.append((provider, config.SOURCE_WEIGHTS["listenbrainz"]))
+        log.info(
+            "ListenBrainz source enabled (feed=%s, weight=%d, resolve=%s)",
+            config.LISTENBRAINZ_URL,
+            config.SOURCE_WEIGHTS["listenbrainz"],
+            "navidrome+ytdlp" if subsonic_client else "ytdlp",
+        )
+    else:
+        log.info(
+            "ListenBrainz source off (weight=%d, feed set=%s).",
+            config.SOURCE_WEIGHTS.get("listenbrainz", 0),
+            config.LISTENBRAINZ_ENABLED,
+        )
+
    if not providers:
        log.warning("no source active: the stream plays its local cache only.")
    return providers, fetchers
@ -135,6 +166,10 @@ def main() -> None:
    finally:
        queue.stop()
        server.server_close()
+        for provider, _ in providers:
+            close = getattr(provider, "close", None)
+            if callable(close):
+                close()
        canonicalizer.close()
        db.close()

--- a/ingest/radieo/canonicalizer.py
+++ b/ingest/radieo/canonicalizer.py
@ -38,6 +38,9 @@ class Canonicalizer:

    def canonicalize(self, track: Track) -> Track:
        """Return the Track with ``mbid`` filled when resolvable, else unchanged."""
+        if track.mbid:
+            # Already identified (e.g. ListenBrainz ships the recording MBID).
+            return track
        artist_norm = norm_name(track.artist)
        title_norm = norm_name(track.title)
        if not artist_norm or not title_norm:
--- a/ingest/radieo/config.py
+++ b/ingest/radieo/config.py
@ -67,10 +67,20 @@ YTDLP_URLS_FILE = Path(os.environ.get("RADIEO_YTDLP_URLS_FILE", "/config/urls.tx
 # How often to reload the URL list and re-expand container URLs, in seconds.
 YTDLP_REFRESH = float(os.environ.get("RADIEO_YTDLP_REFRESH", "300"))

+# --- ListenBrainz suggestions source ---
+# Atom recommendations feed. May be an http(s) URL (the real syndication feed)
+# or a local file path (for testing with a saved sample). Empty disables it.
+# ListenBrainz only *names* tracks; each is resolved to Navidrome then yt-dlp.
+# Surrounding whitespace in the variable is stripped, so a blank value counts
+# as empty.
+LISTENBRAINZ_URL = os.environ.get("RADIEO_LISTENBRAINZ_URL", "").strip()
+# How often to reload the feed, in seconds (it refreshes weekly). Defaults to
+# 3600; a non-numeric value makes float() raise ValueError when this module is
+# imported.
+LISTENBRAINZ_REFRESH = float(os.environ.get("RADIEO_LISTENBRAINZ_REFRESH", "3600"))
+# True exactly when a feed URL or path is configured (non-empty after strip).
+LISTENBRAINZ_ENABLED = bool(LISTENBRAINZ_URL)
+
 # --- Source mix (weights) ---
 # Relative odds of drawing from each source. Isolated here on purpose so the
 # mix can later move to a config file. A weight of 0 disables a source.
 SOURCE_WEIGHTS = {
    "navidrome": int(os.environ.get("RADIEO_WEIGHT_NAVIDROME", "3")),
    "ytdlp": int(os.environ.get("RADIEO_WEIGHT_YTDLP", "1")),
+    "listenbrainz": int(os.environ.get("RADIEO_WEIGHT_LISTENBRAINZ", "2")),
 }
--- a/ingest/radieo/providers/listenbrainz.py
+++ b/ingest/radieo/providers/listenbrainz.py
@ -0,0 +1,178 @@
+"""ListenBrainzProvider: play from a ListenBrainz recommendations feed.
+
+The feed is an Atom document whose ``<content>`` is an HTML list of
+recommendations, each a MusicBrainz *recording* link plus a title and artist::
+
+    <a href="https://musicbrainz.org/recording/MBID">Title</a>
+      by <a href="https://listenbrainz.org/artist//…">Artist</a>
+
+So every suggestion already carries a canonical MBID — we set it on the emitted
+Track directly, which gives a source-agnostic identity for free (no MusicBrainz
+lookup needed).
+
+Unlike the other providers, ListenBrainz yields no audio of its own: it only
+*names* tracks. Each pick is therefore **resolved** to a concrete backend —
+Navidrome first (an OpenSubsonic ``search3``), then yt-dlp (a ``ytsearch1:``
+query) as a fallback — so the download layer is unchanged.
+
+``RADIEO_LISTENBRAINZ_URL`` may be an ``http(s)`` URL (the real syndication
+feed) or a local file path (handy for testing with a saved sample).
+"""
+
+import html
+import logging
+import random
+import re
+import time
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+import httpx
+
+from .. import config
+from ..db import Database
+from ..models import Track, norm_name
+from ..subsonic import SubsonicClient, SubsonicError
+
+# Module-level logger for this provider.
+log = logging.getLogger("radieo.provider.listenbrainz")
+
+# Atom namespace in ElementTree's "{uri}" form; prepended to tag names when
+# looking up <entry> and <content> elements.
+_ATOM = "{http://www.w3.org/2005/Atom}"
+
+# Inside each <li>: the recording link (MBID + title) followed by the artist
+# link (name). Non-greedy gaps pair each recording with its own artist.
+# Capture groups: 1 = recording MBID, 2 = title markup, 3 = artist markup
+# (both still HTML-escaped at this point).
+# NOTE(review): the artist pattern requires a literal "artist//" and both
+# patterns require href to be the first attribute of <a> — confirm against a
+# real feed. If an item had no matching artist link, the DOTALL gap would
+# presumably run on to the next item's artist link; verify.
+_ITEM_RE = re.compile(
+    r'<a\s+href="https://musicbrainz\.org/recording/([0-9a-fA-F-]+)"\s*>(.*?)</a>'
+    r".*?"
+    r'<a\s+href="https://listenbrainz\.org/artist//[^"]*"\s*>(.*?)</a>',
+    re.DOTALL,
+)
+
+
+class ListenBrainzProvider:
+    """Provider that plays tracks named by a ListenBrainz recommendations feed.
+
+    The feed carries no audio, only ``{mbid, title, artist}`` suggestions.
+    ``next`` picks one outside the anti-repeat window and resolves it to a
+    concrete backend: a Navidrome ``search3`` match when a Subsonic client was
+    given, else a yt-dlp ``ytsearch1:`` query when yt-dlp is enabled.
+    """
+
+    name = "listenbrainz"
+
+    def __init__(
+        self,
+        url: str,
+        db: Database,
+        subsonic_client: SubsonicClient | None,
+        ytdlp_enabled: bool,
+    ):
+        """Keep the collaborators and open the HTTP client used for the feed.
+
+        Args:
+            url: Feed location, an ``http(s)`` URL or a local file path.
+            db: Database queried for recently played keys (anti-repeat).
+            subsonic_client: Client used for Navidrome resolution, or ``None``
+                to skip that step.
+            ytdlp_enabled: Whether unresolved picks may fall back to yt-dlp.
+        """
+        self._url = url
+        self._db = db
+        self._subsonic = subsonic_client
+        self._ytdlp_enabled = ytdlp_enabled
+        self._http = httpx.Client(
+            timeout=httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0),
+            headers={"User-Agent": config.USER_AGENT},
+            follow_redirects=True,
+        )
+        # Last successfully parsed recommendations and when they were loaded.
+        self._recs: list[dict] = []
+        self._loaded_at = 0.0
+
+    # --- feed loading -----------------------------------------------------
+
+    def _recommendations(self) -> list[dict]:
+        """Return the recommendations, reloading the feed when it is stale.
+
+        The cached list is reused while it is non-empty and younger than
+        ``config.LISTENBRAINZ_REFRESH`` seconds. A failed load is logged and
+        the previous list is returned without stamping the load time, so the
+        next call tries again.
+        """
+        now = time.time()
+        if self._recs and now - self._loaded_at < config.LISTENBRAINZ_REFRESH:
+            return self._recs
+        try:
+            xml_text = self._fetch_feed()
+            recs = _parse_feed(xml_text)
+        # UnicodeDecodeError covers a local feed file that is not valid UTF-8;
+        # it is a ValueError, so OSError alone would let it escape next().
+        except (httpx.HTTPError, OSError, UnicodeDecodeError, ET.ParseError) as exc:
+            log.warning("could not load ListenBrainz feed: %s", exc)
+            return self._recs  # keep whatever we had
+        if recs and recs != self._recs:
+            log.info("loaded %d ListenBrainz recommendation(s)", len(recs))
+        self._recs = recs
+        self._loaded_at = now
+        return self._recs
+
+    def _fetch_feed(self) -> str:
+        """Return the raw feed text from the URL or, otherwise, the local path.
+
+        Raises ``httpx.HTTPError`` for transport errors and non-2xx responses,
+        ``OSError`` or ``UnicodeDecodeError`` when reading a local file fails.
+        """
+        if self._url.startswith(("http://", "https://")):
+            resp = self._http.get(self._url)
+            resp.raise_for_status()
+            return resp.text
+        return Path(self._url).read_text(encoding="utf-8")
+
+    # --- resolution -------------------------------------------------------
+
+    def next(self) -> Track | None:
+        """Return a resolved Track for a random recommendation, or ``None``.
+
+        Recommendations whose ``mbid:<MBID>`` key is in the recent-keys set are
+        left out; when that leaves nothing, the whole list is used instead.
+        Candidates are tried in random order until one resolves.
+        """
+        recs = self._recommendations()
+        if not recs:
+            return None
+        recent = self._db.recent_keys(config.ANTIREPEAT_WINDOW)
+        pool = [r for r in recs if f"mbid:{r['mbid']}" not in recent] or list(recs)
+        random.shuffle(pool)
+        for rec in pool:
+            track = self._resolve(rec)
+            if track is not None:
+                return track
+        return None
+
+    def _resolve(self, rec: dict) -> Track | None:
+        """Turn one recommendation into a playable Track, Navidrome first.
+
+        Returns ``None`` when Navidrome has no match and yt-dlp is disabled.
+        The feed's MBID is set on the Track in both cases.
+        """
+        artist, title, mbid = rec["artist"], rec["title"], rec["mbid"]
+
+        song = self._search_navidrome(artist, title)
+        if song is not None:
+            return Track(
+                backend="subsonic",
+                locator=str(song["id"]),
+                artist=song.get("artist", artist),
+                title=song.get("title", title),
+                origin=self.name,
+                mbid=mbid,
+                source_ext=song.get("suffix"),
+            )
+
+        if self._ytdlp_enabled:
+            return Track(
+                backend="ytdlp",
+                locator=f"ytsearch1:{artist} {title}",
+                artist=artist,
+                title=title,
+                origin=self.name,
+                mbid=mbid,
+            )
+        return None
+
+    def _search_navidrome(self, artist: str, title: str) -> dict | None:
+        """Return the first Navidrome song matching title and artist, if any.
+
+        The search is by title; a hit must have the same normalized title and
+        an artist that is equal to, contains, or is contained in the wanted
+        one. Returns ``None`` without a client, on a search error (logged), or
+        when nothing matches.
+        """
+        if self._subsonic is None:
+            return None
+        try:
+            songs = self._subsonic.search_songs(title)
+        except (SubsonicError, httpx.HTTPError, OSError) as exc:
+            log.warning("Navidrome search failed for %s — %s: %s", artist, title, exc)
+            return None
+        want_a, want_t = norm_name(artist), norm_name(title)
+        for s in songs:
+            if norm_name(s.get("title", "")) != want_t:
+                continue
+            got_a = norm_name(s.get("artist", ""))
+            if got_a == want_a:
+                return s
+            # Containment either way absorbs "feat." differences. Both sides
+            # must be non-empty: "" is a substring of every string, so a song
+            # whose normalized artist is empty would otherwise match anyone.
+            if got_a and want_a and (got_a in want_a or want_a in got_a):
+                return s
+        return None
+
+    def close(self) -> None:
+        """Close the HTTP client used for feed downloads."""
+        self._http.close()
+
+
+def _parse_feed(xml_text: str) -> list[dict]:
+    """Parse the Atom feed into ``{"mbid", "title", "artist"}`` dicts.
+
+    Entries are scanned in document order. The first occurrence of each
+    lower-cased MBID wins, and items whose title or artist is empty after
+    unescaping and stripping are dropped. Malformed XML raises
+    ``xml.etree.ElementTree.ParseError``.
+    """
+    # An insertion-ordered dict keyed by MBID dedupes and keeps feed order.
+    recs_by_mbid: dict[str, dict] = {}
+    for entry in ET.fromstring(xml_text).findall(f"{_ATOM}entry"):
+        content = entry.find(f"{_ATOM}content")
+        body = content.text if content is not None else None
+        if not body:
+            continue
+        for match in _ITEM_RE.finditer(body):
+            raw_mbid, raw_title, raw_artist = match.groups()
+            mbid = raw_mbid.lower()
+            title = html.unescape(raw_title).strip()
+            artist = html.unescape(raw_artist).strip()
+            if mbid and title and artist and mbid not in recs_by_mbid:
+                recs_by_mbid[mbid] = {"mbid": mbid, "title": title, "artist": artist}
+    return list(recs_by_mbid.values())
--- a/ingest/radieo/subsonic.py
+++ b/ingest/radieo/subsonic.py
@ -85,6 +85,17 @@ class SubsonicClient:
        body = self._get_json("getPlaylist", id=playlist_id)
        return body.get("playlist", {}).get("entry", [])

+    def search_songs(self, query: str, count: int = 25) -> list[dict]:
+        """Search songs by free text via ``search3``.
+
+        Artist and album results are suppressed (their counts are sent as 0)
+        and at most ``count`` songs are requested. Returns the ``song`` list of
+        the ``searchResult3`` object, or an empty list when either is absent.
+        """
+        params = {
+            "query": query,
+            "songCount": count,
+            "artistCount": 0,
+            "albumCount": 0,
+        }
+        result = self._get_json("search3", **params).get("searchResult3", {})
+        return result.get("song", [])
+
    def download(self, song_id: str, dest: Path, hint_ext: str | None = None) -> str:
        """Download a song to ``dest``; return the file extension used.