ingest: weight yt-dlp sources with an optional WEIGHT:URL prefix

Source lines may now start with a numeric weight (e.g. 3:https://…) to
bias how often each is drawn. The prefix is unambiguous since a URL
scheme can never be numeric; unprefixed lines default to weight 1 and a
weight of 0 disables the line. Selection uses Efraimidis-Spirakis
weighted sampling without replacement, preserving the fall-through to
the next source when one fails.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
nemunaire 2026-07-04 16:16:04 +08:00
commit 5dd50f37b7
3 changed files with 64 additions and 10 deletions

View file

@ -7,8 +7,16 @@
# -> radieo y pioche un morceau au hasard à chaque tour,
# en évitant ceux joués récemment.
#
# Poids (optionnel) : préfixer la ligne par `POIDS:URL` pour ajuster la
# fréquence de tirage d'une source. Sans préfixe, le poids vaut 1. Le poids est
# relatif aux autres lignes ; un poids de 0 désactive la ligne. La lecture est
# non ambiguë : un schéma d'URL ne peut jamais être un nombre.
# 3:https://… -> tirée 3× plus souvent qu'une ligne de poids 1
# 0.5:https://… -> tirée 2× moins souvent
# 0:https://… -> désactivée
#
# Exemples (à remplacer par les tiens) :
# https://www.youtube.com/watch?v=dQw4w9WgXcQ
# https://soundcloud.com/artiste/un-morceau
# 2:https://soundcloud.com/artiste/un-morceau
# https://artiste.bandcamp.com/album/un-album
# https://www.youtube.com/playlist?list=PLxxxxxxxx
# 3:https://www.youtube.com/playlist?list=PLxxxxxxxx

View file

@ -66,7 +66,8 @@ SUBSONIC_ENABLED = bool(
# --- yt-dlp source ---
# Plain text file (one URL per line, '#' comments) mounted into the container.
# May contain direct track URLs or container URLs (playlist/album/label/artist),
# from which one track is picked at random. Left absent disables the source.
# from which one track is picked at random. A line may carry an optional
# 'WEIGHT:URL' prefix to bias how often it is drawn. Left absent disables it.
YTDLP_URLS_FILE = Path(os.environ.get("RADIEO_YTDLP_URLS_FILE", "/config/urls.txt"))
# How often to reload the URL list and re-expand container URLs, in seconds.
YTDLP_REFRESH = float(os.environ.get("RADIEO_YTDLP_REFRESH", "300"))

View file

@ -6,6 +6,12 @@ be either a direct track URL or a *container* URL (playlist, album, label,
artist page); container URLs are expanded with a flat yt-dlp extraction and one
entry is picked at random, honouring the anti-repeat window.
A line may carry an optional relative weight as a ``WEIGHT:URL`` prefix (e.g.
``3:https://``). The parse is unambiguous because URL schemes must start with a
letter, so a numeric part before the first ``:`` can only be a weight. Lines
without a prefix default to weight ``1``; a weight of ``0`` disables the line.
Weights bias the *order* in which sources are tried each pick.
The provider only *resolves* references: it emits ``ytdlp`` tracks whose
locator is the chosen media URL. The actual download happens in the matching
fetcher (milestone 4). Flat extraction is cached per source URL to avoid
@ -37,7 +43,8 @@ class YtdlpProvider:
self._urls_file = urls_file
self._db = db
self._cache_dir = cache_dir
self._urls: list[str] = []
# (url, weight) pairs; weight > 0, disabled lines are dropped on load.
self._urls: list[tuple[str, float]] = []
self._loaded_at = 0.0
# source URL -> (entries, loaded_at); entries are normalized dicts.
self._expanded: dict[str, tuple[list[dict], float]] = {}
@ -57,13 +64,39 @@ class YtdlpProvider:
urls = []
for line in lines:
line = line.strip()
if line and not line.startswith("#"):
urls.append(line)
if not line or line.startswith("#"):
continue
url, weight = self._parse_line(line)
if weight > 0:
urls.append((url, weight))
if urls != self._urls:
log.info("loaded %d yt-dlp source URL(s)", len(urls))
self._urls = urls
self._loaded_at = now
@staticmethod
def _parse_line(line: str) -> tuple[str, float]:
    """Split an optional ``WEIGHT:URL`` prefix off a source line.

    The text before the first ``:`` is treated as a weight only when it
    does not start with a letter and parses as a number. RFC 3986 requires
    a URL scheme to begin with a letter, so a head such as ``https`` -- or
    ``inf``/``nan``/``infinity``, which ``float()`` would happily accept --
    always belongs to the URL and the line keeps the default weight.

    Args:
        line: One source line (the loader passes it stripped, non-empty
            and not a comment).

    Returns:
        A ``(url, weight)`` pair. Unprefixed lines get weight ``1.0``. A
        weight of ``0`` disables the line silently. A negative or
        non-finite weight (e.g. ``-1``, ``+inf``, ``1e999``), or a prefix
        with nothing after it, is logged and returned with weight ``0.0``
        so the caller drops the line.
    """
    head, sep, rest = line.partition(":")
    # No colon at all, or a head that starts like a URL scheme: the whole
    # line is the URL. The letter check keeps "inf:"/"nan:" out of float().
    if not sep or head[:1].isalpha():
        return line, 1.0
    try:
        weight = float(head)
    except ValueError:
        return line, 1.0  # no numeric prefix -> the whole line is the URL
    rest = rest.strip()
    # Chained comparison on purpose: NaN fails every comparison and an
    # overflowing literal becomes +inf, so both are rejected here together
    # with negative weights.
    if not rest or not (0.0 <= weight < float("inf")):
        log.warning("ignoring yt-dlp source line with bad weight: %r", line)
        return rest, 0.0
    return rest, weight
# --- resolution -------------------------------------------------------
def next(self) -> Track | None:
@ -71,15 +104,27 @@ class YtdlpProvider:
if not self._urls:
return None
recent = self._db.recent_locators(config.ANTIREPEAT_WINDOW)
# Try source lines in random order until one yields a usable track.
candidates = list(self._urls)
random.shuffle(candidates)
for src in candidates:
# Try source lines in weighted-random order until one yields a track.
for src in self._weighted_order():
track = self._resolve(src, recent)
if track is not None:
return track
return None
def _weighted_order(self) -> list[str]:
    """Return the source URLs in weighted-random order, without replacement.

    This is Efraimidis-Spirakis weighted sampling computed in the log
    domain. The textbook key is ``random() ** (1 / weight)`` sorted in
    descending order; taking ``-log`` of it gives an exponential variate
    with rate ``weight``, sorted in ascending order, which yields the same
    distribution over orderings. The direct power form underflows to
    exactly ``0.0`` for small weights (with weight ``0.001`` roughly half
    of all draws do), and the resulting ties were then ordered by URL
    string rather than randomly.

    Heavier sources tend to come first while the order stays
    probabilistic. Every source appears exactly once, so the caller can
    still fall through to the next source when one fails.

    Returns:
        The URLs from ``self._urls`` (``(url, weight)`` pairs, weight > 0),
        each exactly once, in the drawn order.
    """
    keyed = [(random.expovariate(weight), url) for url, weight in self._urls]
    # Sort on the random key only, so the URL text never influences order.
    keyed.sort(key=lambda pair: pair[0])
    return [url for _, url in keyed]
def _resolve(self, src_url: str, recent: set[str]) -> Track | None:
try:
entry = self._pick_entry(src_url, recent, 0)