From 5dd50f37b79a76d1b4f805359dd0e15a32217a2a Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Sat, 4 Jul 2026 16:16:04 +0800 Subject: [PATCH] ingest: weight yt-dlp sources with an optional WEIGHT:URL prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Source lines may now start with a numeric weight (e.g. 3:https://…) to bias how often each is drawn. The prefix is unambiguous since a URL scheme can never be numeric; unprefixed lines default to weight 1 and a weight of 0 disables the line. Selection uses Efraimidis-Spirakis weighted sampling without replacement, preserving the fall-through to the next source when one fails. Co-Authored-By: Claude Opus 4.8 --- config/urls.txt.example | 12 +++++-- ingest/radieo/config.py | 3 +- ingest/radieo/providers/ytdlp.py | 59 ++++++++++++++++++++++++++++---- 3 files changed, 64 insertions(+), 10 deletions(-) diff --git a/config/urls.txt.example b/config/urls.txt.example index 0d69ff0..99c5714 100644 --- a/config/urls.txt.example +++ b/config/urls.txt.example @@ -7,8 +7,16 @@ # -> radieo y pioche un morceau au hasard à chaque tour, # en évitant ceux joués récemment. # +# Poids (optionnel) : préfixer la ligne par `POIDS:URL` pour ajuster la +# fréquence de tirage d'une source. Sans préfixe, le poids vaut 1. Le poids est +# relatif aux autres lignes ; un poids de 0 désactive la ligne. La lecture est +# non ambiguë : un schéma d'URL ne peut jamais être un nombre. 
+# 3:https://… -> tirée 3× plus souvent qu'une ligne de poids 1 +# 0.5:https://… -> tirée 2× moins souvent +# 0:https://… -> désactivée +# # Exemples (à remplacer par les tiens) : # https://www.youtube.com/watch?v=dQw4w9WgXcQ -# https://soundcloud.com/artiste/un-morceau +# 2:https://soundcloud.com/artiste/un-morceau # https://artiste.bandcamp.com/album/un-album -# https://www.youtube.com/playlist?list=PLxxxxxxxx +# 3:https://www.youtube.com/playlist?list=PLxxxxxxxx diff --git a/ingest/radieo/config.py b/ingest/radieo/config.py index 4965b55..c7c204f 100644 --- a/ingest/radieo/config.py +++ b/ingest/radieo/config.py @@ -66,7 +66,8 @@ SUBSONIC_ENABLED = bool( # --- yt-dlp source --- # Plain text file (one URL per line, '#' comments) mounted into the container. # May contain direct track URLs or container URLs (playlist/album/label/artist), -# from which one track is picked at random. Left absent disables the source. +# from which one track is picked at random. A line may carry an optional +# 'WEIGHT:URL' prefix to bias how often it is drawn. An absent file disables the source. YTDLP_URLS_FILE = Path(os.environ.get("RADIEO_YTDLP_URLS_FILE", "/config/urls.txt")) # How often to reload the URL list and re-expand container URLs, in seconds. YTDLP_REFRESH = float(os.environ.get("RADIEO_YTDLP_REFRESH", "300")) diff --git a/ingest/radieo/providers/ytdlp.py b/ingest/radieo/providers/ytdlp.py index c106e1d..0a8cf33 100644 --- a/ingest/radieo/providers/ytdlp.py +++ b/ingest/radieo/providers/ytdlp.py @@ -6,6 +6,12 @@ be either a direct track URL or a *container* URL (playlist, album, label, artist page); container URLs are expanded with a flat yt-dlp extraction and one entry is picked at random, honouring the anti-repeat window. +A line may carry an optional relative weight as a ``WEIGHT:URL`` prefix (e.g. +``3:https://…``). The parse is unambiguous because URL schemes must start with a +letter, so a numeric part before the first ``:`` can only be a weight. 
Lines +without a prefix default to weight ``1``; a weight of ``0`` disables the line. +Weights bias the *order* in which sources are tried each pick. + The provider only *resolves* references — it emits ``ytdlp`` tracks whose locator is the chosen media URL. The actual download happens in the matching fetcher (milestone 4). Flat extraction is cached per source URL to avoid @@ -37,7 +43,8 @@ class YtdlpProvider: self._urls_file = urls_file self._db = db self._cache_dir = cache_dir - self._urls: list[str] = [] + # (url, weight) pairs; weight > 0, disabled lines are dropped on load. + self._urls: list[tuple[str, float]] = [] self._loaded_at = 0.0 # source URL -> (entries, loaded_at); entries are normalized dicts. self._expanded: dict[str, tuple[list[dict], float]] = {} @@ -57,13 +64,39 @@ class YtdlpProvider: urls = [] for line in lines: line = line.strip() - if line and not line.startswith("#"): - urls.append(line) + if not line or line.startswith("#"): + continue + url, weight = self._parse_line(line) + if weight > 0: + urls.append((url, weight)) if urls != self._urls: log.info("loaded %d yt-dlp source URL(s)", len(urls)) self._urls = urls self._loaded_at = now + @staticmethod + def _parse_line(line: str) -> tuple[str, float]: + """Split an optional ``WEIGHT:URL`` prefix off a source line. + + The part before the first ``:`` is a weight only if it parses as a + number; a URL scheme can never be purely numeric (RFC 3986 requires a + leading letter), so there is no ambiguity. Unprefixed lines default to + weight ``1``; a negative weight or a missing URL is logged and the line + is disabled (weight ``0``). 
+ """ + head, sep, rest = line.partition(":") + if not sep: + return line, 1.0 + try: + weight = float(head) + except ValueError: + return line, 1.0 # no numeric prefix -> the whole line is the URL + rest = rest.strip() + if weight < 0 or not rest: + log.warning("ignoring yt-dlp source line with bad weight: %r", line) + return rest, 0.0 + return rest, weight + # --- resolution ------------------------------------------------------- def next(self) -> Track | None: @@ -71,15 +104,27 @@ class YtdlpProvider: if not self._urls: return None recent = self._db.recent_locators(config.ANTIREPEAT_WINDOW) - # Try source lines in random order until one yields a usable track. - candidates = list(self._urls) - random.shuffle(candidates) - for src in candidates: + # Try source lines in weighted-random order until one yields a track. + for src in self._weighted_order(): track = self._resolve(src, recent) if track is not None: return track return None + def _weighted_order(self) -> list[str]: + """Source URLs ordered by weighted-random sampling without replacement. + + Uses the Efraimidis–Spirakis key ``random() ** (1 / weight)``: a higher + weight makes a larger key likely, so heavier sources tend to come first + while still being probabilistic. Sampling without replacement keeps the + current "fall through to the next source when one fails" behaviour. + """ + keyed = [ + (random.random() ** (1.0 / weight), url) for url, weight in self._urls + ] + keyed.sort(reverse=True) + return [url for _, url in keyed] + def _resolve(self, src_url: str, recent: set[str]) -> Track | None: try: entry = self._pick_entry(src_url, recent, 0)