ingest: weight yt-dlp sources with an optional WEIGHT:URL prefix
Source lines may now start with a numeric weight (e.g. 3:https://…) to bias how often each is drawn. The prefix is unambiguous since a URL scheme can never be numeric; unprefixed lines default to weight 1 and a weight of 0 disables the line. Selection uses Efraimidis-Spirakis weighted sampling without replacement, preserving the fall-through to the next source when one fails. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
8054c98dd1
commit
5dd50f37b7
3 changed files with 64 additions and 10 deletions
|
|
@ -7,8 +7,16 @@
|
|||
# -> radieo y pioche un morceau au hasard à chaque tour,
|
||||
# en évitant ceux joués récemment.
|
||||
#
|
||||
# Poids (optionnel) : préfixer la ligne par `POIDS:URL` pour ajuster la
|
||||
# fréquence de tirage d'une source. Sans préfixe, le poids vaut 1. Le poids est
|
||||
# relatif aux autres lignes ; un poids de 0 désactive la ligne. La lecture est
|
||||
# non ambiguë : un schéma d'URL ne peut jamais être un nombre.
|
||||
# 3:https://… -> tirée 3× plus souvent qu'une ligne de poids 1
|
||||
# 0.5:https://… -> tirée 2× moins souvent
|
||||
# 0:https://… -> désactivée
|
||||
#
|
||||
# Exemples (à remplacer par les tiens) :
|
||||
# https://www.youtube.com/watch?v=dQw4w9WgXcQ
|
||||
# https://soundcloud.com/artiste/un-morceau
|
||||
# 2:https://soundcloud.com/artiste/un-morceau
|
||||
# https://artiste.bandcamp.com/album/un-album
|
||||
# https://www.youtube.com/playlist?list=PLxxxxxxxx
|
||||
# 3:https://www.youtube.com/playlist?list=PLxxxxxxxx
|
||||
|
|
|
|||
|
|
@ -66,7 +66,8 @@ SUBSONIC_ENABLED = bool(
|
|||
# --- yt-dlp source ---
|
||||
# Plain text file (one URL per line, '#' comments) mounted into the container.
|
||||
# May contain direct track URLs or container URLs (playlist/album/label/artist),
|
||||
# from which one track is picked at random. Left absent disables the source.
|
||||
# from which one track is picked at random. A line may carry an optional
|
||||
# 'WEIGHT:URL' prefix to bias how often it is drawn. If the file is absent, the yt-dlp source is disabled.
|
||||
YTDLP_URLS_FILE = Path(os.environ.get("RADIEO_YTDLP_URLS_FILE", "/config/urls.txt"))
|
||||
# How often to reload the URL list and re-expand container URLs, in seconds.
|
||||
YTDLP_REFRESH = float(os.environ.get("RADIEO_YTDLP_REFRESH", "300"))
|
||||
|
|
|
|||
|
|
@ -6,6 +6,12 @@ be either a direct track URL or a *container* URL (playlist, album, label,
|
|||
artist page); container URLs are expanded with a flat yt-dlp extraction and one
|
||||
entry is picked at random, honouring the anti-repeat window.
|
||||
|
||||
A line may carry an optional relative weight as a ``WEIGHT:URL`` prefix (e.g.
|
||||
``3:https://…``). The parse is unambiguous because URL schemes must start with a
|
||||
letter, so a numeric part before the first ``:`` can only be a weight. Lines
|
||||
without a prefix default to weight ``1``; a weight of ``0`` disables the line.
|
||||
Weights bias the *order* in which sources are tried on each pick.
|
||||
|
||||
The provider only *resolves* references — it emits ``ytdlp`` tracks whose
|
||||
locator is the chosen media URL. The actual download happens in the matching
|
||||
fetcher (milestone 4). Flat extraction is cached per source URL to avoid
|
||||
|
|
@ -37,7 +43,8 @@ class YtdlpProvider:
|
|||
self._urls_file = urls_file
|
||||
self._db = db
|
||||
self._cache_dir = cache_dir
|
||||
self._urls: list[str] = []
|
||||
# (url, weight) pairs; weight > 0, disabled lines are dropped on load.
|
||||
self._urls: list[tuple[str, float]] = []
|
||||
self._loaded_at = 0.0
|
||||
# source URL -> (entries, loaded_at); entries are normalized dicts.
|
||||
self._expanded: dict[str, tuple[list[dict], float]] = {}
|
||||
|
|
@ -57,13 +64,39 @@ class YtdlpProvider:
|
|||
urls = []
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if line and not line.startswith("#"):
|
||||
urls.append(line)
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
url, weight = self._parse_line(line)
|
||||
if weight > 0:
|
||||
urls.append((url, weight))
|
||||
if urls != self._urls:
|
||||
log.info("loaded %d yt-dlp source URL(s)", len(urls))
|
||||
self._urls = urls
|
||||
self._loaded_at = now
|
||||
|
||||
@staticmethod
def _parse_line(line: str) -> tuple[str, float]:
    """Split an optional ``WEIGHT:URL`` prefix off a source line.

    The text before the first ``:`` is taken as a weight only when
    ``float()`` accepts it; otherwise the whole line is the URL and gets the
    default weight ``1``.

    Args:
        line: One source line, as passed by the loader.

    Returns:
        A ``(url, weight)`` pair:

        * no ``:`` at all, or a non-numeric head -> ``(line, 1.0)``;
        * a finite weight ``>= 0`` followed by a non-empty URL ->
          ``(url, weight)`` (a weight of ``0`` disables the line without
          any log message);
        * a negative or non-finite weight, or an empty URL part -> a
          warning is logged and ``(url, 0.0)`` is returned.
    """
    import math  # local import: the file's import block is not touched

    head, sep, rest = line.partition(":")
    if not sep:
        return line, 1.0
    try:
        weight = float(head)
    except ValueError:
        return line, 1.0  # no numeric prefix -> the whole line is the URL
    rest = rest.strip()
    # float() also accepts "nan", "inf" and "infinity". Those are not usable
    # weights (and, being letter-only, they are not protected by the
    # "a scheme starts with a letter" argument), so reject them explicitly
    # instead of letting them leak into the weighted sampling.
    if not math.isfinite(weight) or weight < 0 or not rest:
        log.warning("ignoring yt-dlp source line with bad weight: %r", line)
        return rest, 0.0
    return rest, weight
|
||||
|
||||
# --- resolution -------------------------------------------------------
|
||||
|
||||
def next(self) -> Track | None:
|
||||
|
|
@ -71,15 +104,27 @@ class YtdlpProvider:
|
|||
if not self._urls:
|
||||
return None
|
||||
recent = self._db.recent_locators(config.ANTIREPEAT_WINDOW)
|
||||
# Try source lines in random order until one yields a usable track.
|
||||
candidates = list(self._urls)
|
||||
random.shuffle(candidates)
|
||||
for src in candidates:
|
||||
# Try source lines in weighted-random order until one yields a track.
|
||||
for src in self._weighted_order():
|
||||
track = self._resolve(src, recent)
|
||||
if track is not None:
|
||||
return track
|
||||
return None
|
||||
|
||||
def _weighted_order(self) -> list[str]:
|
||||
"""Source URLs ordered by weighted-random sampling without replacement.
|
||||
|
||||
Uses the Efraimidis–Spirakis key ``random() ** (1 / weight)``: a higher
|
||||
weight makes a larger key likely, so heavier sources tend to come first
|
||||
while still being probabilistic. Sampling without replacement keeps the
|
||||
current "fall through to the next source when one fails" behaviour.
|
||||
"""
|
||||
keyed = [
|
||||
(random.random() ** (1.0 / weight), url) for url, weight in self._urls
|
||||
]
|
||||
keyed.sort(reverse=True)
|
||||
return [url for _, url in keyed]
|
||||
|
||||
def _resolve(self, src_url: str, recent: set[str]) -> Track | None:
|
||||
try:
|
||||
entry = self._pick_entry(src_url, recent, 0)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue