From 112a4b0c8955adb43e173488ac4af93003230df9 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Sat, 4 Jul 2026 17:01:04 +0800 Subject: [PATCH] ingest: bias yt-dlp picks towards a discography's newest releases Bandcamp label/artist pages list releases newest-first, so give the first YTDLP_RECENT_COUNT entries a YTDLP_RECENT_BOOST multiplier when picking, leaning the radio towards fresh music. Flat extraction carries no dates, so list position is used as the recency proxy rather than a real date lookup. The boost is gated on the source URL being a bandcamp discography listing: single /album/ and /track/ pages (whether a URL-file line or one the pick recursed into) list tracks in track order, not by recency, and non-bandcamp sources have unverified ordering, so all of those keep a uniform pick. Co-Authored-By: Claude Opus 4.8 --- README.md | 8 ++++++ ingest/radieo/config.py | 9 ++++++ ingest/radieo/providers/ytdlp.py | 47 +++++++++++++++++++++++++++++++- 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 650d438..e330b68 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,14 @@ Add one URL per line: a single track, or a playlist/album/label/artist page to pick from. The file is mounted read-only, so you can edit it without rebuilding. A missing file just disables the yt-dlp source. +To favour fresh music, set `RADIEO_YTDLP_RECENT_BOOST` (in `.env`) above `1.0`: +when picking from a Bandcamp label/artist/discography page — which lists +releases newest-first — the newest `RADIEO_YTDLP_RECENT_COUNT` releases (default +`5`) get that multiplier on their odds (e.g. `2.0` makes them twice as likely). +The default `1.0` disables it. The boost only applies to such discography +listings; single `/album/` and `/track/` pages and non-Bandcamp sources keep a +uniform pick. + ### 4. 
ListenBrainz suggestions Point `RADIEO_LISTENBRAINZ_URL` (in `.env`) at your recommendations syndication diff --git a/ingest/radieo/config.py b/ingest/radieo/config.py index 71ee9a4..0f30652 100644 --- a/ingest/radieo/config.py +++ b/ingest/radieo/config.py @@ -105,6 +105,15 @@ SUBSONIC_ENABLED = bool( YTDLP_URLS_FILE = Path(os.environ.get("RADIEO_YTDLP_URLS_FILE", "/config/urls.txt")) # How often to reload the URL list and re-expand container URLs, in seconds. YTDLP_REFRESH = float(os.environ.get("RADIEO_YTDLP_REFRESH", "300")) +# Recency boost. Bandcamp label/artist/discography pages list releases +# newest-first, so the first entries are the freshest. When picking a release +# from such a listing, the newest RADIEO_YTDLP_RECENT_COUNT entries get this +# multiplier on their selection weight, biasing the radio towards new music. +# 1.0 disables the boost (uniform pick); 2.0 makes fresh releases twice as +# likely per slot. Applied only at the top level of a source (a discography): +# once drilled into an album, entry order is track order, not recency. +YTDLP_RECENT_BOOST = float(os.environ.get("RADIEO_YTDLP_RECENT_BOOST", "1.0")) +YTDLP_RECENT_COUNT = int(os.environ.get("RADIEO_YTDLP_RECENT_COUNT", "5")) # --- ListenBrainz suggestions source --- # Atom recommendations feed. May be an http(s) URL (the real syndication feed) diff --git a/ingest/radieo/providers/ytdlp.py b/ingest/radieo/providers/ytdlp.py index 0a8cf33..2904fb0 100644 --- a/ingest/radieo/providers/ytdlp.py +++ b/ingest/radieo/providers/ytdlp.py @@ -12,6 +12,11 @@ letter, so a numeric part before the first ``:`` can only be a weight. Lines without a prefix default to weight ``1``; a weight of ``0`` disables the line. Weights bias the *order* in which sources are tried each pick. 
+Within a discography (a label/artist page), the newest releases can be given a
+selection boost (``YTDLP_RECENT_BOOST``): bandcamp lists releases newest-first,
+so the first ``YTDLP_RECENT_COUNT`` entries are favoured, keeping the radio
+leaning towards fresh music.
+
 The provider only *resolves* references — it emits ``ytdlp`` tracks whose
 locator is the chosen media URL. The actual download happens in the matching
 fetcher (milestone 4). Flat extraction is cached per source URL to avoid
@@ -21,6 +26,7 @@ re-hitting the network on every pick.
 import logging
 import random
 import time
+import urllib.parse
 from pathlib import Path
 
 from .. import config, tagging
@@ -179,7 +185,7 @@ class YtdlpProvider:
         if len(entries) == 1 and entries[0]["url"] == src_url:
             return entries[0]  # leaf: a track resolves to itself
         pool = [e for e in entries if e["url"] not in recent] or entries
-        entry = random.choice(pool)
+        entry = self._choose(src_url, pool, entries)
         url = entry["url"]
         if url == src_url or depth >= self._MAX_EXPAND_DEPTH:
             return entry
@@ -187,6 +193,63 @@ class YtdlpProvider:
         # Fall back to the entry as-is if that turns up nothing usable.
         return self._pick_entry(url, recent, depth + 1) or entry
 
+    @staticmethod
+    def _choose(src_url: str, pool: list[dict], entries: list[dict]) -> dict:
+        """Pick one entry, favouring the newest releases of a discography.
+
+        Bandcamp label/artist/discography pages return releases newest-first, so
+        the first ``YTDLP_RECENT_COUNT`` entries are the freshest; they get
+        ``YTDLP_RECENT_BOOST`` times the weight of the rest, biasing selection
+        towards new music without needing a per-release date lookup (flat
+        extraction carries no dates).
+
+        The boost only applies when ``src_url`` is such a discography listing
+        (see ``_boostable``). For a single ``/album/`` or ``/track/`` source —
+        whether it is a line in the URL file or one the pick recursed into — the
+        entries are tracks in track order, where position says nothing about
+        recency, so the pick stays uniform, as it does when the boost is
+        disabled (``== 1.0``).
+
+        ``pool`` holds the candidates to draw from and ``entries`` the full
+        listing, whose leading entries define "newest". Both settings come
+        straight from the environment, so bad values must not break a pick: a
+        negative or NaN boost, a non-positive count, or weights that
+        ``random.choices`` rejects (all zero, or non-finite) all degrade to the
+        uniform pick.
+        """
+        boost = config.YTDLP_RECENT_BOOST
+        # A negative count would slice from the end (``entries[:-5]``) and boost
+        # everything *except* the last entries; clamp it so it merely disables.
+        count = max(config.YTDLP_RECENT_COUNT, 0)
+        # ``not boost >= 0`` also rejects NaN, which fails every comparison.
+        if boost == 1.0 or not boost >= 0 or count == 0:
+            return random.choice(pool)
+        if not YtdlpProvider._boostable(src_url):
+            return random.choice(pool)
+        newest = {e["url"] for e in entries[:count]}
+        weights = [boost if e["url"] in newest else 1.0 for e in pool]
+        try:
+            return random.choices(pool, weights=weights, k=1)[0]
+        except ValueError:
+            # Total weight is zero (boost 0 and only boosted candidates left) or
+            # not finite (infinite boost): fall back to a uniform pick.
+            return random.choice(pool)
+
+    @staticmethod
+    def _boostable(src_url: str) -> bool:
+        """True when ``src_url`` lists releases newest-first (a discography).
+
+        The recency boost relies on bandcamp ordering a label/artist/music page
+        newest-first. That does not hold for a single ``/album/`` or ``/track/``
+        page (those list tracks in track order) nor for non-bandcamp sources
+        whose ordering we have not verified, so the boost is limited to bandcamp
+        discography URLs.
+        """
+        if not tagging.is_bandcamp({}, src_url):
+            return False
+        path = urllib.parse.urlsplit(src_url).path
+        return "/album/" not in path and "/track/" not in path
+
     def _resolved_tags(self, locator: str):
         """Corrected identity a previous download wrote into the cached file.