Milestone 6: ListenBrainz recommendations provider

Add a third playback source: a ListenBrainz recommendations Atom feed. Each
suggestion already carries a MusicBrainz recording MBID, title and artist, so
it is keyed directly by MBID (source-agnostic identity, no extra lookup) and
resolved to a concrete file — Navidrome search3 first, then a yt-dlp
ytsearch1: fallback.

- providers/listenbrainz.py: parse the Atom/HTML feed, anti-repeat on the MBID
  key, resolve Navidrome-then-yt-dlp. Feed may be an http(s) URL or a local
  path (for testing).
- subsonic.py: add search_songs (search3) for resolution.
- canonicalizer.py: short-circuit when a Track already has an MBID, so
  feed-provided MBIDs are trusted and MusicBrainz is not hit.
- __main__.py: wire the provider in; register the yt-dlp fetcher as a
  resolution backend even when the yt-dlp source is off; close providers on
  shutdown.
- config/compose/.env.example: RADIEO_LISTENBRAINZ_URL + weight.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
nemunaire 2026-07-02 19:10:01 +08:00
commit 66d93e5034
9 changed files with 284 additions and 16 deletions

View file

@ -27,18 +27,19 @@ def _build_pipeline(db: Database):
cache fallback."""
providers = [] # list[(provider, weight)]
fetchers = {} # backend name -> fetcher
subsonic_client = None # reused by ListenBrainz for resolution
if config.NAVIDROME_ENABLED:
from .fetchers.subsonic import SubsonicFetcher
from .providers.navidrome import NavidromeProvider
from .subsonic import SubsonicClient
client = SubsonicClient(
subsonic_client = SubsonicClient(
config.NAVIDROME_URL, config.NAVIDROME_USER, config.NAVIDROME_PASSWORD
)
provider = NavidromeProvider(client, config.NAVIDROME_PLAYLIST, db)
provider = NavidromeProvider(subsonic_client, config.NAVIDROME_PLAYLIST, db)
providers.append((provider, config.SOURCE_WEIGHTS.get("navidrome", 0)))
fetchers["subsonic"] = SubsonicFetcher(client, config.CACHE_DIR)
fetchers["subsonic"] = SubsonicFetcher(subsonic_client, config.CACHE_DIR)
log.info(
"Navidrome source enabled (playlist=%r, weight=%d)",
config.NAVIDROME_PLAYLIST,
@ -66,6 +67,36 @@ def _build_pipeline(db: Database):
config.YTDLP_URLS_FILE.exists(),
)
if config.LISTENBRAINZ_ENABLED and config.SOURCE_WEIGHTS.get("listenbrainz", 0) > 0:
from .providers.listenbrainz import ListenBrainzProvider
# ListenBrainz has no backend of its own: it resolves each suggestion to
# Navidrome then yt-dlp. Make sure the yt-dlp fetcher exists as a
# resolution target even when the yt-dlp *source* is off.
if "ytdlp" not in fetchers:
from .fetchers.ytdlp import YtdlpFetcher
fetchers["ytdlp"] = YtdlpFetcher(config.CACHE_DIR)
provider = ListenBrainzProvider(
config.LISTENBRAINZ_URL,
db,
subsonic_client=subsonic_client,
ytdlp_enabled="ytdlp" in fetchers,
)
providers.append((provider, config.SOURCE_WEIGHTS["listenbrainz"]))
log.info(
"ListenBrainz source enabled (feed=%s, weight=%d, resolve=%s)",
config.LISTENBRAINZ_URL,
config.SOURCE_WEIGHTS["listenbrainz"],
"navidrome+ytdlp" if subsonic_client else "ytdlp",
)
else:
log.info(
"ListenBrainz source off (weight=%d, feed set=%s).",
config.SOURCE_WEIGHTS.get("listenbrainz", 0),
config.LISTENBRAINZ_ENABLED,
)
if not providers:
log.warning("no source active: the stream plays its local cache only.")
return providers, fetchers
@ -135,6 +166,10 @@ def main() -> None:
finally:
queue.stop()
server.server_close()
for provider, _ in providers:
close = getattr(provider, "close", None)
if callable(close):
close()
canonicalizer.close()
db.close()

View file

@ -38,6 +38,9 @@ class Canonicalizer:
def canonicalize(self, track: Track) -> Track:
"""Return the Track with ``mbid`` filled when resolvable, else unchanged."""
if track.mbid:
# Already identified (e.g. ListenBrainz ships the recording MBID).
return track
artist_norm = norm_name(track.artist)
title_norm = norm_name(track.title)
if not artist_norm or not title_norm:

View file

@ -67,10 +67,20 @@ YTDLP_URLS_FILE = Path(os.environ.get("RADIEO_YTDLP_URLS_FILE", "/config/urls.tx
# How often to reload the URL list and re-expand container URLs, in seconds.
YTDLP_REFRESH = float(os.environ.get("RADIEO_YTDLP_REFRESH", "300"))
# --- ListenBrainz suggestions source ---
# Where the Atom recommendations feed lives: either an http(s) URL (the real
# syndication feed) or a path to a local file (a saved sample, for testing).
# ListenBrainz only *names* tracks; each is resolved to Navidrome then yt-dlp.
LISTENBRAINZ_URL = os.getenv("RADIEO_LISTENBRAINZ_URL", "").strip()
# The source is switched on simply by configuring a feed; empty disables it.
LISTENBRAINZ_ENABLED = LISTENBRAINZ_URL != ""
# Seconds between feed reloads (the feed itself refreshes weekly).
LISTENBRAINZ_REFRESH = float(os.getenv("RADIEO_LISTENBRAINZ_REFRESH", "3600"))
# --- Source mix (weights) ---
# Relative odds of drawing the next track from each source; a weight of 0
# disables that source. Kept in one place on purpose so the mix can later
# move to a config file.
SOURCE_WEIGHTS = {
    "navidrome": int(os.getenv("RADIEO_WEIGHT_NAVIDROME", "3")),
    "ytdlp": int(os.getenv("RADIEO_WEIGHT_YTDLP", "1")),
    "listenbrainz": int(os.getenv("RADIEO_WEIGHT_LISTENBRAINZ", "2")),
}

View file

@ -0,0 +1,178 @@
"""ListenBrainzProvider: play from a ListenBrainz recommendations feed.
The feed is an Atom document whose ``<content>`` is an HTML list of
recommendations, each a MusicBrainz *recording* link plus a title and artist::
<a href="https://musicbrainz.org/recording/MBID">Title</a>
by <a href="https://listenbrainz.org/artist//…">Artist</a>
So every suggestion already carries a canonical MBID; we set it on the emitted
Track directly, which gives a source-agnostic identity for free (no MusicBrainz
lookup needed).
Unlike the other providers, ListenBrainz yields no audio of its own: it only
*names* tracks. Each pick is therefore **resolved** to a concrete backend:
Navidrome first (an OpenSubsonic ``search3``), then yt-dlp (a ``ytsearch1:``
query) as a fallback, so the download layer is unchanged.
``RADIEO_LISTENBRAINZ_URL`` may be an ``http(s)`` URL (the real syndication
feed) or a local file path (handy for testing with a saved sample).
"""
import html
import logging
import random
import re
import time
import xml.etree.ElementTree as ET
from pathlib import Path
import httpx
from .. import config
from ..db import Database
from ..models import Track, norm_name
from ..subsonic import SubsonicClient, SubsonicError
log = logging.getLogger("radieo.provider.listenbrainz")
_ATOM = "{http://www.w3.org/2005/Atom}"
# Inside each <li>: the recording link (MBID + title) followed by the artist
# link (name). Non-greedy gaps pair each recording with its own artist.
_ITEM_RE = re.compile(
r'<a\s+href="https://musicbrainz\.org/recording/([0-9a-fA-F-]+)"\s*>(.*?)</a>'
r".*?"
r'<a\s+href="https://listenbrainz\.org/artist//[^"]*"\s*>(.*?)</a>',
re.DOTALL,
)
class ListenBrainzProvider:
    """Provider that draws tracks from a ListenBrainz recommendations feed.

    The feed only *names* recordings (MBID, title, artist). Each pick is
    resolved to a playable ``Track``: a Navidrome ``search3`` match when a
    Subsonic client was supplied, otherwise a yt-dlp ``ytsearch1:`` query when
    yt-dlp resolution is enabled. The recording MBID from the feed is set on
    every emitted Track.
    """

    name = "listenbrainz"

    # Upper bound, in seconds, on the wait before re-trying the feed after a
    # load that failed or produced no recommendations. Without it, such a
    # feed would be re-fetched on every single ``next()`` call.
    _RETRY_DELAY = 60.0

    def __init__(
        self,
        url: str,
        db: Database,
        subsonic_client: SubsonicClient | None,
        ytdlp_enabled: bool,
    ):
        """Set up the provider.

        Args:
            url: Feed location; an ``http(s)`` URL or a local file path.
            db: Database used to read the recently played keys (anti-repeat).
            subsonic_client: Client used for Navidrome resolution, or ``None``
                to skip that step.
            ytdlp_enabled: Whether unresolved picks may fall back to a yt-dlp
                search locator.
        """
        self._url = url
        self._db = db
        self._subsonic = subsonic_client
        self._ytdlp_enabled = ytdlp_enabled
        self._http = httpx.Client(
            timeout=httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0),
            headers={"User-Agent": config.USER_AGENT},
            follow_redirects=True,
        )
        self._recs: list[dict] = []
        # Monotonic-clock deadline before which the cached list is served
        # as-is. ``-inf`` forces a load on the first call.
        self._refresh_at = float("-inf")

    # --- feed loading -----------------------------------------------------

    def _recommendations(self) -> list[dict]:
        """Return the current recommendation list, reloading it when due.

        A successful, non-empty load is cached for
        ``config.LISTENBRAINZ_REFRESH`` seconds. A failed load keeps the
        previous list. Failed and empty loads are retried after at most
        ``_RETRY_DELAY`` seconds rather than on every call.
        """
        # Monotonic time: the refresh interval must not be affected by
        # wall-clock adjustments.
        now = time.monotonic()
        if now < self._refresh_at:
            return self._recs
        retry_in = min(config.LISTENBRAINZ_REFRESH, self._RETRY_DELAY)
        try:
            recs = _parse_feed(self._fetch_feed())
        except (httpx.HTTPError, OSError, UnicodeDecodeError, ET.ParseError) as exc:
            log.warning("could not load ListenBrainz feed: %s", exc)
            self._refresh_at = now + retry_in
            return self._recs  # keep whatever we had
        if recs and recs != self._recs:
            log.info("loaded %d ListenBrainz recommendation(s)", len(recs))
        self._recs = recs
        self._refresh_at = now + (config.LISTENBRAINZ_REFRESH if recs else retry_in)
        return self._recs

    def _fetch_feed(self) -> str:
        """Return the raw feed text from the configured URL or local path.

        Raises:
            httpx.HTTPError: On a transport error or a non-success status.
            OSError: When the local file cannot be read.
            UnicodeDecodeError: When the local file is not valid UTF-8.
        """
        if self._url.startswith(("http://", "https://")):
            resp = self._http.get(self._url)
            resp.raise_for_status()
            return resp.text
        return Path(self._url).read_text(encoding="utf-8")

    # --- resolution -------------------------------------------------------

    def next(self) -> Track | None:
        """Pick a recommendation and resolve it to a playable Track.

        Recommendations whose ``mbid:<MBID>`` key was played recently are
        avoided; if that leaves nothing, the whole list is eligible again.
        Candidates are tried in random order until one resolves.

        Returns:
            The resolved Track, or ``None`` when the feed is empty or no
            candidate could be resolved.
        """
        recs = self._recommendations()
        if not recs:
            return None
        recent = self._db.recent_keys(config.ANTIREPEAT_WINDOW)
        pool = [r for r in recs if f"mbid:{r['mbid']}" not in recent] or list(recs)
        random.shuffle(pool)
        for rec in pool:
            track = self._resolve(rec)
            if track is not None:
                return track
        return None

    def _resolve(self, rec: dict) -> Track | None:
        """Turn one ``{mbid, title, artist}`` dict into a Track, if possible.

        Navidrome is tried first; yt-dlp (a ``ytsearch1:`` locator) is the
        fallback when enabled. Returns ``None`` when neither applies.
        """
        artist, title, mbid = rec["artist"], rec["title"], rec["mbid"]
        song = self._search_navidrome(artist, title)
        if song is not None:
            return Track(
                backend="subsonic",
                locator=str(song["id"]),
                # Prefer the library's own tags, but never let a missing or
                # null tag replace the names we already have from the feed.
                artist=song.get("artist") or artist,
                title=song.get("title") or title,
                origin=self.name,
                mbid=mbid,
                source_ext=song.get("suffix"),
            )
        if self._ytdlp_enabled:
            return Track(
                backend="ytdlp",
                locator=f"ytsearch1:{artist} {title}",
                artist=artist,
                title=title,
                origin=self.name,
                mbid=mbid,
            )
        return None

    def _search_navidrome(self, artist: str, title: str) -> dict | None:
        """Find a Navidrome song matching ``artist`` / ``title``.

        The search is by title; a hit must have the same normalised title and
        a normalised artist that equals, contains, or is contained in the
        wanted one.

        Returns:
            The matching song dict, or ``None`` when there is no client, the
            names normalise to nothing, the search fails, or nothing matches.
        """
        if self._subsonic is None:
            return None
        want_a, want_t = norm_name(artist), norm_name(title)
        if not want_a or not want_t:
            # An empty needle would make the containment test below trivially
            # true and accept any same-title song.
            return None
        try:
            songs = self._subsonic.search_songs(title)
        except (SubsonicError, httpx.HTTPError, OSError) as exc:
            log.warning("Navidrome search failed for %s - %s: %s", artist, title, exc)
            return None
        for s in songs:
            if norm_name(s.get("title") or "") != want_t:
                continue
            got_a = norm_name(s.get("artist") or "")
            if not got_a:
                # "" is a substring of everything: an untagged artist must
                # not count as a match.
                continue
            # Exact or containment either way, to absorb "feat." differences.
            if got_a in want_a or want_a in got_a:
                return s
        return None

    def close(self) -> None:
        """Close the HTTP client used to fetch the feed."""
        self._http.close()
def _parse_feed(xml_text: str) -> list[dict]:
    """Parse the Atom feed into ``{mbid, title, artist}`` dicts.

    Each ``<entry>``'s ``<content>`` text is scanned for recommendation items.
    MBIDs are lower-cased, titles and artists are HTML-unescaped and stripped,
    and only the first occurrence of each MBID is kept, in feed order.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_text`` is not well-formed.
    """
    by_mbid: dict[str, dict] = {}
    for entry in ET.fromstring(xml_text).findall(f"{_ATOM}entry"):
        body = entry.find(f"{_ATOM}content")
        markup = body.text if body is not None else None
        if not markup:
            continue
        for match in _ITEM_RE.finditer(markup):
            raw_mbid, raw_title, raw_artist = match.group(1, 2, 3)
            mbid = raw_mbid.lower()
            title = html.unescape(raw_title).strip()
            artist = html.unescape(raw_artist).strip()
            # Keep complete, not-yet-seen items only; dict order is feed order.
            if mbid and title and artist and mbid not in by_mbid:
                by_mbid[mbid] = {"mbid": mbid, "title": title, "artist": artist}
    return list(by_mbid.values())

View file

@ -85,6 +85,17 @@ class SubsonicClient:
body = self._get_json("getPlaylist", id=playlist_id)
return body.get("playlist", {}).get("entry", [])
def search_songs(self, query: str, count: int = 25) -> list[dict]:
    """Search songs by free text through the ``search3`` endpoint.

    Artist and album results are turned off (their counts are 0), so only
    songs come back. Returns the ``song`` list of the ``searchResult3``
    object, or an empty list when either key is absent.
    """
    params = {
        "query": query,
        "songCount": count,
        "artistCount": 0,
        "albumCount": 0,
    }
    response = self._get_json("search3", **params)
    result = response.get("searchResult3", {})
    return result.get("song", [])
def download(self, song_id: str, dest: Path, hint_ext: str | None = None) -> str:
"""Download a song to ``dest``; return the file extension used.