From 66d93e5034ef9ae678cdb9dd61ff1f4dac328528 Mon Sep 17 00:00:00 2001
From: Pierre-Olivier Mercier <nemunaire@nemunai.re>
Date: Thu, 2 Jul 2026 19:10:01 +0800
Subject: [PATCH] Milestone 6: ListenBrainz recommendations provider
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a third playback source: a ListenBrainz recommendations Atom feed. Each
suggestion already carries a MusicBrainz recording MBID, title and artist, so
it is keyed directly by MBID (source-agnostic identity, no extra lookup) and
resolved to a concrete file — Navidrome search3 first, then a yt-dlp
ytsearch1: fallback.

- providers/listenbrainz.py: parse the Atom/HTML feed, anti-repeat on the MBID
  key, resolve Navidrome-then-yt-dlp. Feed may be an http(s) URL or a local
  path (for testing).
- subsonic.py: add search_songs (search3) for resolution.
- canonicalizer.py: short-circuit when a Track already has an MBID, so
  feed-provided MBIDs are trusted and MusicBrainz is not hit.
- __main__.py: wire the provider in; register the yt-dlp fetcher as a
  resolution backend even when the yt-dlp source is off; close providers on
  shutdown.
- config/compose/.env.example: RADIEO_LISTENBRAINZ_URL + weight.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .env.example                            |   8 ++
 .gitignore                              |   3 +
 README.md                               |  42 ++++--
 docker-compose.yml                      |   4 +
 ingest/radieo/__main__.py               |  41 +++++-
 ingest/radieo/canonicalizer.py          |   3 +
 ingest/radieo/config.py                 |  10 ++
 ingest/radieo/providers/listenbrainz.py | 178 ++++++++++++++++++++++++
 ingest/radieo/subsonic.py               |  11 ++
 9 files changed, 284 insertions(+), 16 deletions(-)
 create mode 100644 ingest/radieo/providers/listenbrainz.py

diff --git a/.env.example b/.env.example
index 67fea57..c033ba4 100644
--- a/.env.example
+++ b/.env.example
@@ -14,10 +14,18 @@ RADIEO_NAVIDROME_PLAYLIST=Radio
 # La liste d'URL se met dans config/urls.txt (copier config/urls.txt.example).
 # Rien à mettre ici ; le fichier absent désactive simplement la source.
 
+# --- Source ListenBrainz ---
+# Feed Atom de recommandations. URL http(s) du feed de syndication, ou chemin
+# local sous /config (ex. /config/recommendations.xml) pour tester. Vide = off.
+# ListenBrainz ne fait que *nommer* des morceaux : chacun est résolu vers
+# Navidrome puis, à défaut, yt-dlp.
+RADIEO_LISTENBRAINZ_URL=https://listenbrainz.org/syndication-feed/user/monuser/recommendations/weekly-exploration
+
 # --- Dosage du mix entre sources (optionnel) ---
 # Poids relatifs de tirage de chaque source (0 désactive la source).
 RADIEO_WEIGHT_NAVIDROME=3
 RADIEO_WEIGHT_YTDLP=1
+RADIEO_WEIGHT_LISTENBRAINZ=2
 
 # --- Canonicalizer MBID (optionnel) ---
 # Résout (artiste, titre) -> MBID MusicBrainz pour dédupliquer entre sources.
diff --git a/.gitignore b/.gitignore
index 9163637..1b6794b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,9 @@
 # Liste de sources yt-dlp personnelle (garder seulement l'exemple)
 /config/urls.txt
 
+# Échantillon de feed ListenBrainz personnel
+/config/recommendations.xml
+
 # État de l'ingestion (base SQLite persistante)
 /state/
 *.db
diff --git a/README.md b/README.md
index 5e979b4..6d29cfa 100644
--- a/README.md
+++ b/README.md
@@ -61,18 +61,33 @@ the stream plays whatever is already in `cache/` (the milestone-1/2 behaviour).
 For the yt-dlp source, list the URLs to draw from in `config/urls.txt` (copy
 `config/urls.txt.example`). Each line is either a direct track URL or a
 container URL (playlist, album, label, artist page) from which one track is
-picked at random. The relative mix between sources is set by
-`RADIEO_WEIGHT_NAVIDROME` / `RADIEO_WEIGHT_YTDLP` (a weight of 0 disables a
-source); the file being absent also disables yt-dlp.
+picked at random.
+
+For the ListenBrainz source, set `RADIEO_LISTENBRAINZ_URL` to your
+recommendations feed (the Atom syndication URL, e.g.
+`https://listenbrainz.org/syndication-feed/user/<you>/recommendations/weekly-exploration`,
+or a local file path under `config/` for testing). ListenBrainz only *names*
+tracks, so each suggestion is resolved to a real file: Navidrome first
+(a `search3` lookup), then yt-dlp (`ytsearch1:`) as a fallback. The
+MusicBrainz recording MBID that the feed already carries is used as the
+track's canonical identity (no extra lookup needed).
+
+The relative mix between sources is set by `RADIEO_WEIGHT_NAVIDROME` /
+`RADIEO_WEIGHT_YTDLP` / `RADIEO_WEIGHT_LISTENBRAINZ` (a weight of 0 disables a
+source); an empty URL / missing file also disables the corresponding source.
 
 ## Current status
 
-**Milestone 5 — MBID canonicalizer: done.**
+**Milestone 6 — ListenBrainz provider: done.**
 
-- Two playback sources feed a weighted scheduler: a Navidrome/OpenSubsonic
-  playlist and a hand-maintained list of yt-dlp URLs (`config/urls.txt`).
-  Container URLs (playlist/album/label/artist) are expanded and one track is
-  drawn at random.
+- Three playback sources feed a weighted scheduler: a Navidrome/OpenSubsonic
+  playlist, a hand-maintained list of yt-dlp URLs (`config/urls.txt`), and a
+  ListenBrainz recommendations feed. yt-dlp container URLs
+  (playlist/album/label/artist) are expanded and one track is drawn at random.
+- ListenBrainz suggestions carry a MusicBrainz recording MBID, a title and an
+  artist; each is resolved to a concrete file (Navidrome `search3` first, then
+  a yt-dlp `ytsearch1:` fallback) and keyed directly by its MBID — so the same
+  song is de-duplicated across all three sources for free.
 - Each track is canonicalized to a MusicBrainz recording MBID (no API key
   needed; ~1 req/s, best-effort, results cached in SQLite). This gives a
   source-agnostic identity, so the same song from two sources collapses to one;
@@ -92,9 +107,10 @@ source); the file being absent also disables yt-dlp.
 - HTTP stream served at `http://localhost:8000/radio.mp3` (MP3, 192 kbps),
   multiple simultaneous listeners supported.
 
-The ListenBrainz suggestion feed comes next. (Known cosmetic quirk: at startup
-the fallback logs a few harmless ffmpeg "Invalid data" warnings while probing
-non-audio files such as `.gitkeep`; to be quieted in the polish milestone.)
+Polish comes next (crossfade tuning, robustness, optional web player, config
+file). (Known cosmetic quirk: at startup the fallback logs a few harmless
+ffmpeg "Invalid data" warnings while probing non-audio files such as
+`.gitkeep`; to be quieted in the polish milestone.)
 
 ## Roadmap
 
@@ -107,6 +123,6 @@ non-audio files such as `.gitkeep`; to be quieted in the polish milestone.)
    weighted mixing between sources.
 5. ✅ **Canonicalizer** — MusicBrainz MBID lookup for source-agnostic
    de-duplication.
-6. **ListenBrainz provider** — parse the RSS suggestions feed and resolve each
-   one to Navidrome or yt-dlp.
+6. ✅ **ListenBrainz provider** — parse the recommendations feed and resolve
+   each suggestion to Navidrome or yt-dlp.
 7. **Polish** — crossfade, robustness, optional web player, config file.
diff --git a/docker-compose.yml b/docker-compose.yml
index 0862ffd..24f1730 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -19,9 +19,13 @@ services:
       - RADIEO_RETENTION_KEEP=${RADIEO_RETENTION_KEEP:-20}
       # Source yt-dlp : liste d'URL dans config/urls.txt (créer depuis l'exemple).
       - RADIEO_YTDLP_URLS_FILE=/config/urls.txt
+      # Source ListenBrainz : URL du feed de recommandations (ou chemin local
+      # sous /config, ex. /config/recommendations.xml). Vide désactive la source.
+      - RADIEO_LISTENBRAINZ_URL=${RADIEO_LISTENBRAINZ_URL:-}
       # Dosage du mix entre les sources (0 désactive).
       - RADIEO_WEIGHT_NAVIDROME=${RADIEO_WEIGHT_NAVIDROME:-3}
       - RADIEO_WEIGHT_YTDLP=${RADIEO_WEIGHT_YTDLP:-1}
+      - RADIEO_WEIGHT_LISTENBRAINZ=${RADIEO_WEIGHT_LISTENBRAINZ:-2}
       # Canonicalizer MusicBrainz (identité MBID inter-sources ; sans clé).
       - RADIEO_CANONICAL_ENABLED=${RADIEO_CANONICAL_ENABLED:-1}
       - RADIEO_USER_AGENT=${RADIEO_USER_AGENT:-radieo/0.1 (personal music radio)}
diff --git a/ingest/radieo/__main__.py b/ingest/radieo/__main__.py
index 831946c..7923a9c 100644
--- a/ingest/radieo/__main__.py
+++ b/ingest/radieo/__main__.py
@@ -27,18 +27,19 @@ def _build_pipeline(db: Database):
     cache fallback."""
     providers = []  # list[(provider, weight)]
     fetchers = {}  # backend name -> fetcher
+    subsonic_client = None  # reused by ListenBrainz for resolution
 
     if config.NAVIDROME_ENABLED:
         from .fetchers.subsonic import SubsonicFetcher
         from .providers.navidrome import NavidromeProvider
         from .subsonic import SubsonicClient
 
-        client = SubsonicClient(
+        subsonic_client = SubsonicClient(
             config.NAVIDROME_URL, config.NAVIDROME_USER, config.NAVIDROME_PASSWORD
         )
-        provider = NavidromeProvider(client, config.NAVIDROME_PLAYLIST, db)
+        provider = NavidromeProvider(subsonic_client, config.NAVIDROME_PLAYLIST, db)
         providers.append((provider, config.SOURCE_WEIGHTS.get("navidrome", 0)))
-        fetchers["subsonic"] = SubsonicFetcher(client, config.CACHE_DIR)
+        fetchers["subsonic"] = SubsonicFetcher(subsonic_client, config.CACHE_DIR)
         log.info(
             "Navidrome source enabled (playlist=%r, weight=%d)",
             config.NAVIDROME_PLAYLIST,
@@ -66,6 +67,36 @@ def _build_pipeline(db: Database):
             config.YTDLP_URLS_FILE.exists(),
         )
 
+    if config.LISTENBRAINZ_ENABLED and config.SOURCE_WEIGHTS.get("listenbrainz", 0) > 0:
+        from .providers.listenbrainz import ListenBrainzProvider
+
+        # ListenBrainz has no backend of its own: it resolves each suggestion to
+        # Navidrome then yt-dlp. Make sure the yt-dlp fetcher exists as a
+        # resolution target even when the yt-dlp *source* is off.
+        if "ytdlp" not in fetchers:
+            from .fetchers.ytdlp import YtdlpFetcher
+
+            fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR)
+        provider = ListenBrainzProvider(
+            config.LISTENBRAINZ_URL,
+            db,
+            subsonic_client=subsonic_client,
+            ytdlp_enabled="ytdlp" in fetchers,
+        )
+        providers.append((provider, config.SOURCE_WEIGHTS["listenbrainz"]))
+        log.info(
+            "ListenBrainz source enabled (feed=%s, weight=%d, resolve=%s)",
+            config.LISTENBRAINZ_URL,
+            config.SOURCE_WEIGHTS["listenbrainz"],
+            "navidrome+ytdlp" if subsonic_client else "ytdlp",
+        )
+    else:
+        log.info(
+            "ListenBrainz source off (weight=%d, feed set=%s).",
+            config.SOURCE_WEIGHTS.get("listenbrainz", 0),
+            config.LISTENBRAINZ_ENABLED,
+        )
+
     if not providers:
         log.warning("no source active: the stream plays its local cache only.")
     return providers, fetchers
@@ -135,6 +166,10 @@ def main() -> None:
     finally:
         queue.stop()
         server.server_close()
+        for provider, _ in providers:
+            close = getattr(provider, "close", None)
+            if callable(close):
+                close()
         canonicalizer.close()
         db.close()
 
diff --git a/ingest/radieo/canonicalizer.py b/ingest/radieo/canonicalizer.py
index bdcbf05..4ce0373 100644
--- a/ingest/radieo/canonicalizer.py
+++ b/ingest/radieo/canonicalizer.py
@@ -38,6 +38,9 @@ class Canonicalizer:
 
     def canonicalize(self, track: Track) -> Track:
         """Return the Track with ``mbid`` filled when resolvable, else unchanged."""
+        if track.mbid:
+            # Already identified (e.g. ListenBrainz ships the recording MBID).
+            return track
         artist_norm = norm_name(track.artist)
         title_norm = norm_name(track.title)
         if not artist_norm or not title_norm:
diff --git a/ingest/radieo/config.py b/ingest/radieo/config.py
index 402dcda..55cb6e0 100644
--- a/ingest/radieo/config.py
+++ b/ingest/radieo/config.py
@@ -67,10 +67,20 @@ YTDLP_URLS_FILE = Path(os.environ.get("RADIEO_YTDLP_URLS_FILE", "/config/urls.tx
 # How often to reload the URL list and re-expand container URLs, in seconds.
 YTDLP_REFRESH = float(os.environ.get("RADIEO_YTDLP_REFRESH", "300"))
 
+# --- ListenBrainz suggestions source ---
+# Atom recommendations feed. May be an http(s) URL (the real syndication feed)
+# or a local file path (for testing with a saved sample). Empty disables it.
+# ListenBrainz only *names* tracks; each is resolved to Navidrome then yt-dlp.
+LISTENBRAINZ_URL = os.environ.get("RADIEO_LISTENBRAINZ_URL", "").strip()
+# How often to reload the feed, in seconds (the feed itself changes weekly).
+LISTENBRAINZ_REFRESH = float(os.environ.get("RADIEO_LISTENBRAINZ_REFRESH", "3600"))
+LISTENBRAINZ_ENABLED = bool(LISTENBRAINZ_URL)
+
 # --- Source mix (weights) ---
 # Relative odds of drawing from each source. Isolated here on purpose so the
 # mix can later move to a config file. A weight of 0 disables a source.
 SOURCE_WEIGHTS = {
     "navidrome": int(os.environ.get("RADIEO_WEIGHT_NAVIDROME", "3")),
     "ytdlp": int(os.environ.get("RADIEO_WEIGHT_YTDLP", "1")),
+    "listenbrainz": int(os.environ.get("RADIEO_WEIGHT_LISTENBRAINZ", "2")),
 }
diff --git a/ingest/radieo/providers/listenbrainz.py b/ingest/radieo/providers/listenbrainz.py
new file mode 100644
index 0000000..4772484
--- /dev/null
+++ b/ingest/radieo/providers/listenbrainz.py
@@ -0,0 +1,178 @@
+"""ListenBrainzProvider: play from a ListenBrainz recommendations feed.
+
+The feed is an Atom document whose ``<content>`` is an HTML list of
+recommendations, each a MusicBrainz *recording* link plus a title and artist::
+
+    <a href="https://musicbrainz.org/recording/MBID">Title</a>
+      by <a href="https://listenbrainz.org/artist//…">Artist</a>
+
+So every suggestion already carries a canonical MBID — we set it on the emitted
+Track directly, which gives a source-agnostic identity for free (no MusicBrainz
+lookup needed).
+
+Unlike the other providers, ListenBrainz yields no audio of its own: it only
+*names* tracks. Each pick is therefore **resolved** to a concrete backend —
+Navidrome first (an OpenSubsonic ``search3``), then yt-dlp (a ``ytsearch1:``
+query) as a fallback — so the download layer is unchanged.
+
+``RADIEO_LISTENBRAINZ_URL`` may be an ``http(s)`` URL (the real syndication
+feed) or a local file path (handy for testing with a saved sample).
+"""
+
+import html
+import logging
+import random
+import re
+import time
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+import httpx
+
+from .. import config
+from ..db import Database
+from ..models import Track, norm_name
+from ..subsonic import SubsonicClient, SubsonicError
+
+log = logging.getLogger("radieo.provider.listenbrainz")
+
+_ATOM = "{http://www.w3.org/2005/Atom}"
+
+# Per <li>: recording link (MBID + title) then artist link (any /artist/ URL).
+# Title stops at </a> and the gap at </li>, so items are never cross-paired.
+_ITEM_RE = re.compile(
+    r'<a\s+href="https://musicbrainz\.org/recording/([0-9a-fA-F-]+)"\s*>'
+    r"((?:(?!</a>).)*)</a>(?:(?!</li>).)*?"
+    r'<a\s+href="https://listenbrainz\.org/artist/[^"]*"\s*>(.*?)</a>',
+    re.DOTALL,
+)
+
+
+class ListenBrainzProvider:
+    name = "listenbrainz"
+    _retry_delay = 60.0  # seconds between feed attempts while nothing is cached
+
+    def __init__(
+        self,
+        url: str,
+        db: Database,
+        subsonic_client: SubsonicClient | None,
+        ytdlp_enabled: bool,
+    ):
+        self._url = url
+        self._db = db
+        self._subsonic = subsonic_client
+        self._ytdlp_enabled = ytdlp_enabled
+        self._http = httpx.Client(
+            timeout=httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0),
+            headers={"User-Agent": config.USER_AGENT},
+            follow_redirects=True,
+        )
+        self._recs: list[dict] = []
+        self._loaded_at = 0.0  # time.time() of the last load attempt
+
+    # --- feed loading -----------------------------------------------------
+    def _recommendations(self) -> list[dict]:
+        """Return the cached recommendations, reloading the feed when due."""
+        # Throttle on attempts, so a failing or empty feed isn't refetched per pick.
+        wait = config.LISTENBRAINZ_REFRESH if self._recs else self._retry_delay
+        if time.time() - self._loaded_at < wait:
+            return self._recs
+        self._loaded_at = time.time()
+        try:
+            recs = _parse_feed(self._fetch_feed())
+        except (httpx.HTTPError, OSError, ValueError, ET.ParseError) as exc:
+            log.warning("could not load ListenBrainz feed: %s", exc)
+            return self._recs  # keep whatever we had
+        if recs and recs != self._recs:
+            log.info("loaded %d ListenBrainz recommendation(s)", len(recs))
+        self._recs = recs
+        return recs
+
+    def _fetch_feed(self) -> str:
+        if self._url.startswith(("http://", "https://")):
+            resp = self._http.get(self._url)
+            resp.raise_for_status()
+            return resp.text
+        return Path(self._url).read_text(encoding="utf-8")
+
+    # --- resolution -------------------------------------------------------
+    def next(self) -> Track | None:
+        """Resolve a random recommendation, preferring ones not played recently."""
+        recs = self._recommendations()
+        if not recs:
+            return None
+        recent = self._db.recent_keys(config.ANTIREPEAT_WINDOW)
+        pool = [r for r in recs if f"mbid:{r['mbid']}" not in recent] or list(recs)
+        random.shuffle(pool)
+        for rec in pool:
+            if (track := self._resolve(rec)) is not None:
+                return track
+        return None
+
+    def _resolve(self, rec: dict) -> Track | None:
+        """Turn one recommendation into a Track: Navidrome if found, else yt-dlp."""
+        artist, title, mbid = rec["artist"], rec["title"], rec["mbid"]
+        song = self._search_navidrome(artist, title)
+        if song is not None:
+            return Track(
+                backend="subsonic",
+                locator=str(song["id"]),
+                artist=song.get("artist", artist),
+                title=song.get("title", title),
+                origin=self.name,
+                mbid=mbid,
+                source_ext=song.get("suffix"),
+            )
+        if not self._ytdlp_enabled:
+            return None
+        return Track(
+            backend="ytdlp",
+            locator=f"ytsearch1:{artist} {title}",
+            artist=artist,
+            title=title,
+            origin=self.name,
+            mbid=mbid,
+        )
+
+    def _search_navidrome(self, artist: str, title: str) -> dict | None:
+        """Return the first search3 hit matching title and artist, else None."""
+        if self._subsonic is None:
+            return None
+        try:
+            songs = self._subsonic.search_songs(title)
+        except (SubsonicError, httpx.HTTPError, OSError) as exc:
+            log.warning("Navidrome search failed for %s — %s: %s", artist, title, exc)
+            return None
+        want, want_title = norm_name(artist), norm_name(title)
+        for s in songs:
+            if norm_name(s.get("title", "")) != want_title:
+                continue
+            got = norm_name(s.get("artist", ""))
+            # Exact, or containment ("feat." credits) — but never via an empty name.
+            if got == want or (got and want and (got in want or want in got)):
+                return s
+        return None
+
+    def close(self) -> None:
+        self._http.close()
+
+
+def _parse_feed(xml_text: str) -> list[dict]:
+    """Parse the Atom feed into ``{mbid, title, artist}`` dicts.
+
+    Feed order is kept; repeated MBIDs and items with an empty field are dropped.
+    """
+    by_mbid: dict[str, dict] = {}
+    for entry in ET.fromstring(xml_text).iterfind(f"{_ATOM}entry"):
+        body = entry.findtext(f"{_ATOM}content")
+        for hit in _ITEM_RE.finditer(body or ""):
+            raw_mbid, raw_title, raw_artist = hit.groups()
+            rec = {
+                "mbid": raw_mbid.lower(),
+                "title": html.unescape(raw_title).strip(),
+                "artist": html.unescape(raw_artist).strip(),
+            }
+            if all(rec.values()) and rec["mbid"] not in by_mbid:
+                by_mbid[rec["mbid"]] = rec
+    return list(by_mbid.values())
diff --git a/ingest/radieo/subsonic.py b/ingest/radieo/subsonic.py
index b5365ae..76a3157 100644
--- a/ingest/radieo/subsonic.py
+++ b/ingest/radieo/subsonic.py
@@ -85,6 +85,17 @@ class SubsonicClient:
         body = self._get_json("getPlaylist", id=playlist_id)
         return body.get("playlist", {}).get("entry", [])
 
+    def search_songs(self, query: str, count: int = 25) -> list[dict]:
+        """Search songs with ``search3``, asking for at most ``count`` of them.
+
+        Returns the ``song`` list of the reply, or ``[]`` when it is absent.
+        """
+        # Artists and albums are not wanted here, so zero of each is requested.
+        wanted = {"songCount": count, "artistCount": 0, "albumCount": 0}
+        reply = self._get_json("search3", query=query, **wanted)
+        found = reply.get("searchResult3", {})
+        return found.get("song", [])
+
     def download(self, song_id: str, dest: Path, hint_ext: str | None = None) -> str:
         """Download a song to ``dest``; return the file extension used.