Add atom feed, update main script and fix some issues

This commit is contained in:
nemunaire 2012-04-15 16:30:53 +02:00 committed by root
parent cf8262d3cb
commit 30cd0ba548
5 changed files with 326 additions and 109 deletions

13
dl.sh
View File

@ -1,4 +1,4 @@
#!/bin/sh #!/bin/bash
pwd=`echo "$0" | sed -e "s/[^\/]*$//"` pwd=`echo "$0" | sed -e "s/[^\/]*$//"`
@ -28,9 +28,16 @@ if [ ! -e /tmp/dlEnCours ]; then
echo "-- " >> $pwd/database echo "-- " >> $pwd/database
echo "$lign" > $f.ec echo "$lign" > $f.ec
$pwd/youtube-dl --no-progress -c -o "$pwd/content/%(id)s.%(ext)s" "$lign" $pwd/youtube-dl -c -o "$pwd/content/%(id)s.%(ext)s" "$lign"
# $pwd/youtube-dl --no-progress -c -o "$pwd/content/%(id)s.%(ext)s" "$lign"
echo "$lign" >> $f.ec echo "$lign" >> $f.ec
$pwd/youtube-dl --no-progress -c -k -o "$pwd/content/%(id)s.%(ext)s" --extract-audio --audio-format=mp3 "$lign" echo "${f:$((${#pwd} + 7)):4}"
if [ "${f:$((${#pwd} + 7)):4}" == "nemu" ]; then
echo "nemu convertion"
$pwd/youtube-dl --no-progress -c -k -o "$pwd/content/%(id)s.%(ext)s" --extract-audio --audio-format=vorbis "$lign"
else
$pwd/youtube-dl --no-progress -c -k -o "$pwd/content/%(id)s.%(ext)s" --extract-audio --audio-format=mp3 "$lign"
fi
echo "" > $f.ec echo "" > $f.ec
echo "$lign" >> $f.done echo "$lign" >> $f.done

119
htdocs/atom.php Normal file
View File

@ -0,0 +1,119 @@
<?php
define("MAIN_DIR", __dir__."/..");
/**
 * Look up the database record whose first line is $url.
 *
 * MAIN_DIR."/database" is read as a list of non-empty lines. Records are
 * delimited by lines that trim to "--"; only the first line after a
 * delimiter (or the very first line of the file) is compared with $url.
 *
 * @param string $url First line of the record to find.
 * @return array|null NULL when the database cannot be read, when no record
 *   matches, or when fewer than three lines remain from the match.
 *   Otherwise array(line 0, line 1, line 2, line 3 or NULL, mp3 name, ogg name),
 *   where the two names are the record's file name with its 1-4 character
 *   alphanumeric extension replaced by ".mp3" and ".ogg".
 */
function get_info($url)
{
    $lines = file(MAIN_DIR."/database", FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);
    if ($lines === false)
        return NULL;
    $total = count($lines);
    $url = (string) $url;

    // Walk from record start to record start until the URL is found.
    $i = 0;
    while ($i < $total && $lines[$i] !== $url)
    {
        // Not this record: advance to its delimiter, then step past it.
        while ($i < $total && trim($lines[$i]) != "--")
            $i++;
        $i++;
    }

    if ($i + 2 >= $total)
        return NULL;

    // A record may end the file after only three lines, so line 3 can be absent.
    $fourth = isset($lines[$i + 3]) ? $lines[$i + 3] : NULL;

    // When line 3 is missing or is the delimiter, the record has three lines
    // and the file name is on line 2; otherwise it is on line 3.
    if ($fourth === NULL || trim($fourth) == "--")
        $source = $lines[$i + 2];
    else
        $source = $fourth;

    $pattern = "#^(.+)\.([a-zA-Z0-9]{1,4})$#";
    $filename = preg_replace($pattern, '\1.mp3', $source);
    $filenameogg = preg_replace($pattern, '\1.ogg', $filename);

    return array($lines[$i], $lines[$i + 1], $lines[$i + 2], $fourth, trim($filename), trim($filenameogg));
}
// Atom feed of a user's most recent finished downloads.
// The user is the first query-string key with an empty value
// (e.g. atom.php?nemu); without one, "generic" is used.

/**
 * Create an element whose content is $text as a text node.
 *
 * DOMDocument::createElement() does not escape "&" in its value argument,
 * so free text (titles, URLs) has to be attached as a text node instead.
 */
function atom_text_element($xml, $name, $text)
{
    $element = $xml->createElement($name);
    $element->appendChild($xml->createTextNode($text));
    return $element;
}

$user = "generic";
foreach ($_GET as $k => $t)
{
    if (empty($t))
    {
        $user = $k;
        break;
    }
}

// The D modifier stops "$" from matching before a trailing newline.
if (!preg_match("#^[a-zA-Z0-9_]+$#D", $user))
    die ("Le nom d'utilisateur contient des caractères interdits.");

// Directory of the current script as an absolute URL, always ending in "/".
// The query string is removed first so dirname() only sees the path.
$request_path = strtok($_SERVER["REQUEST_URI"], "?");
$base_url = "http://".$_SERVER["SERVER_NAME"].rtrim(dirname($request_path), "/\\")."/";

header("Content-type: application/atom+xml;charset=utf-8");

$xml = new DOMDocument('1.0', 'UTF-8');
$xml->formatOutput = true;

$xml_feed = $xml->createElement("feed");
$xml_feed->setAttribute("xmlns", "http://www.w3.org/2005/Atom");

$xml_feed_link = $xml->createElement("link");
$xml_feed_link->setAttribute("rel", "self");
$xml_feed_link->setAttribute("href", $base_url."atom.php?".$user);

$xml_feed_rights = $xml->createElement("rights", "Pommultimédia Online Converter");
$xml_feed_rights->setAttribute("type", "text");

$xml_feed_author = $xml->createElement("author");
$xml_feed_author->appendChild($xml->createElement("name", "nemunaire"));

$xml_feed->appendChild($xml->createElement("title", "Musiques téléchargées de ".$user));
$xml_feed->appendChild($xml->createElement("updated", date('c')));
$xml_feed->appendChild($xml->createElement("id", "http://musik.p0m.fr/atom.php?".$user));
$xml_feed->appendChild($xml_feed_link);
$xml_feed->appendChild($xml->createElement("generator", "Onyx Atom generator"));
$xml_feed->appendChild($xml_feed_rights);
$xml_feed->appendChild($xml_feed_author);

$done_list = MAIN_DIR."/users/".$user.".dlist.done";
if (is_file($done_list))
{
    $ec = file($done_list, FILE_IGNORE_NEW_LINES|FILE_SKIP_EMPTY_LINES);

    // Examine at most the last ten lines, newest first. Lines without a
    // database record or a converted file still count towards the ten.
    for ($i = count($ec) - 1, $cnt = 0; $i >= 0 && $cnt < 10; $i--, $cnt++)
    {
        $musik = get_info($ec[$i]);
        if (!isset($musik))
            continue;

        // Prefer the .ogg file when both conversions exist.
        $mp3 = MAIN_DIR."/content/".$musik[4];
        $ogg = MAIN_DIR."/content/".$musik[5];
        if (is_file($ogg))
            $media = $ogg;
        elseif (is_file($mp3))
            $media = $mp3;
        else
            continue;

        $xml_entry = $xml->createElement("entry");
        $xml_entry->appendChild(atom_text_element($xml, "id", "http://musik.p0m.fr/".$musik[0]));
        $xml_entry->appendChild(atom_text_element($xml, "title", strip_tags($musik[1])));
        $xml_entry->appendChild($xml->createElement("updated", date('c', filectime($media))));

        // Download link. setAttribute() escapes "&" itself, and the "?" and
        // "&" separators must stay literal for dl.php to receive its parameters.
        $xml_entry_enclosure = $xml->createElement("link");
        $xml_entry_enclosure->setAttribute("href", $base_url."dl.php?".$user."&f=".$i);
        $xml_entry_enclosure->setAttribute("rel", "enclosure");
        $xml_entry->appendChild($xml_entry_enclosure);

        // Link back to the original URL stored in the record.
        $xml_entry_via = $xml->createElement("link");
        $xml_entry_via->setAttribute("href", $musik[0]);
        $xml_entry_via->setAttribute("rel", "via");
        $xml_entry->appendChild($xml_entry_via);

        // type="html" content is HTML-escaped text; the text node then adds
        // the XML-level escaping, so no undefined entity reaches the output.
        $xml_entry_summary = atom_text_element($xml, "summary", htmlspecialchars($musik[2], ENT_QUOTES, "UTF-8"));
        $xml_entry_summary->setAttribute("type", "html");
        $xml_entry->appendChild($xml_entry_summary);

        $xml_entry_link = $xml->createElement("link");
        $xml_entry_link->setAttribute("rel", "alternate");
        $xml_entry_link->setAttribute("href", $base_url."?".$user);
        $xml_entry->appendChild($xml_entry_link);

        $xml_feed->appendChild($xml_entry);
    }
}

$xml->appendChild($xml_feed);
//Cache::set('flux', array("date" => time(), "flux" => $xml->saveXML()));
print $xml->saveXML();
?>

View File

@ -17,10 +17,14 @@ function get_info($url)
} }
} }
if ($i < $nbLign) if ($i+2 < $nbLign)
{ {
$filename = preg_replace("#^(.+)\.([a-zA-Z0-9]{1,4})$#", '\1.mp3', $ec[$i+3]); if ($ec[$i+3] == "-- ")
return array($ec[$i], $ec[$i+1], $ec[$i+2], $ec[$i+3], trim($filename)); @$filename = preg_replace("#^(.+)\.([a-zA-Z0-9]{1,4})$#", '\1.mp3', $ec[$i+2]);
else
@$filename = preg_replace("#^(.+)\.([a-zA-Z0-9]{1,4})$#", '\1.mp3', $ec[$i+3]);
@$filenameogg = preg_replace("#^(.+)\.([a-zA-Z0-9]{1,4})$#", '\1.ogg', $filename);
return @array($ec[$i], $ec[$i+1], $ec[$i+2], $ec[$i+3], trim($filename), trim($filenameogg));
} }
else else
return NULL; return NULL;
@ -50,13 +54,19 @@ if (isset($_GET["f"]))
if ($k == $_GET["f"]) if ($k == $_GET["f"])
{ {
$musik = get_info($lign); $musik = get_info($lign);
if (isset($musik) && is_file(MAIN_DIR."/content/".$musik[4])) if (isset($musik) && (is_file(MAIN_DIR."/content/".$musik[4]) || is_file(MAIN_DIR."/content/".$musik[5])))
{ {
$filename = MAIN_DIR."/content/".$musik[4]; if (is_file(MAIN_DIR."/content/".$musik[5]))
$filename = MAIN_DIR."/content/".$musik[5];
else
$filename = MAIN_DIR."/content/".$musik[4];
header('Content-Description: File Transfer'); header('Content-Description: File Transfer');
header('Content-Type: application/octet-stream'); header('Content-Type: application/octet-stream');
header("Content-Disposition: attachment; filename=\"".$musik[1].".mp3\""); if (is_file(MAIN_DIR."/content/".$musik[5]))
header("Content-Disposition: attachment; filename=\"".$musik[1].".ogg\"");
else
header("Content-Disposition: attachment; filename=\"".$musik[1].".mp3\"");
header('Content-Transfer-Encoding: binary'); header('Content-Transfer-Encoding: binary');
header('Expires: 0'); header('Expires: 0');
header('Cache-Control: must-revalidate, post-check=0, pre-check=0'); header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
@ -70,4 +80,4 @@ if (isset($_GET["f"]))
} }
} }
die ("Fichier introuvable"); die ("Fichier introuvable");
?> ?>

View File

@ -20,8 +20,12 @@ function get_info($url)
if ($i+2 < $nbLign) if ($i+2 < $nbLign)
{ {
@$filename = preg_replace("#^(.+)\.([a-zA-Z0-9]{1,4})$#", '\1.mp3', $ec[$i+3]); if ($ec[$i+3] == "-- ")
return @array($ec[$i], $ec[$i+1], $ec[$i+2], $ec[$i+3], trim($filename)); @$filename = preg_replace("#^(.+)\.([a-zA-Z0-9]{1,4})$#", '\1.mp3', $ec[$i+2]);
else
@$filename = preg_replace("#^(.+)\.([a-zA-Z0-9]{1,4})$#", '\1.mp3', $ec[$i+3]);
@$filenameogg = preg_replace("#^(.+)\.([a-zA-Z0-9]{1,4})$#", '\1.ogg', $filename);
return @array($ec[$i], $ec[$i+1], $ec[$i+2], $ec[$i+3], trim($filename), trim($filenameogg));
} }
else else
return NULL; return NULL;
@ -64,7 +68,7 @@ if (is_file(MAIN_DIR."/users/".$user.".dlist.done"))
foreach ($ec as $k => $lign) foreach ($ec as $k => $lign)
{ {
$musik = get_info($lign); $musik = get_info($lign);
if(isset($musik) && is_file($dir.'/'.$musik[4])) if(isset($musik) && (is_file($dir.'/'.$musik[4]) || is_file($dir.'/'.$musik[5])))
{ {
$someone = true; $someone = true;
echo '<li><a href="dl.php?'.$user.'&amp;f='.$k.'">'.$musik[1].'</a></li>'; echo '<li><a href="dl.php?'.$user.'&amp;f='.$k.'">'.$musik[1].'</a></li>';
@ -96,7 +100,12 @@ if (empty($someone))
print "<h3 style=\"color: #00FF00;\">La demande de vidage de la liste a été ajouté à la file d'attente</h3>"; print "<h3 style=\"color: #00FF00;\">La demande de vidage de la liste a été ajouté à la file d'attente</h3>";
} }
} }
elseif (preg_match("#^http://(www.)?youtube.com/watch\?v=([a-zA-Z0-9_-]+)#", $url, $matched)) elseif (
preg_match("#^http://(www.)?youtube.com/watch\?v=([a-zA-Z0-9_-]+)#", $url, $matched)
|| preg_match("#^http://youtu.be/([a-zA-Z0-9_-]+)#", $url, $matched)
|| preg_match("#^http://(www.)?dailymotion.com/video/([a-zA-Z0-9_-]+)#", $url, $matched)
|| preg_match("#^http://(www.)?vimeo.com/([0-9]+)#", $url, $matched)
)
{ {
//Check if the URL isn't already in the file //Check if the URL isn't already in the file
if (is_file(MAIN_DIR."/users/".$user.".dlist")) if (is_file(MAIN_DIR."/users/".$user.".dlist"))

View File

@ -18,7 +18,7 @@ __authors__ = (
) )
__license__ = 'Public Domain' __license__ = 'Public Domain'
__version__ = '2012.02.26' __version__ = '2012.02.27'
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
@ -490,6 +490,8 @@ class FileDownloader(object):
updatetime: Use the Last-modified header to set output file timestamps. updatetime: Use the Last-modified header to set output file timestamps.
writedescription: Write the video description to a .description file writedescription: Write the video description to a .description file
writeinfojson: Write the video description to a .info.json file writeinfojson: Write the video description to a .info.json file
writesubtitles: Write the video subtitles to a .srt file
subtitleslang: Language of the subtitles to download
""" """
params = None params = None
@ -681,6 +683,10 @@ class FileDownloader(object):
""" Report that the description file is being written """ """ Report that the description file is being written """
self.to_screen(u'[info] Writing video description to: ' + descfn) self.to_screen(u'[info] Writing video description to: ' + descfn)
def report_writesubtitles(self, srtfn):
    """ Tell the user which file the subtitles are being written to """
    message = u'[info] Writing video subtitles to: ' + srtfn
    self.to_screen(message)
def report_writeinfojson(self, infofn): def report_writeinfojson(self, infofn):
""" Report that the metadata file has been written """ """ Report that the metadata file has been written """
self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
@ -808,6 +814,21 @@ class FileDownloader(object):
except (OSError, IOError): except (OSError, IOError):
self.trouble(u'ERROR: Cannot write description file ' + descfn) self.trouble(u'ERROR: Cannot write description file ' + descfn)
return return
if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
# subtitles download errors are already managed as troubles in relevant IE
# that way it will silently go on when used with unsupporting IE
try:
srtfn = filename.rsplit('.', 1)[0] + u'.srt'
self.report_writesubtitles(srtfn)
srtfile = open(_encodeFilename(srtfn), 'wb')
try:
srtfile.write(info_dict['subtitles'].encode('utf-8'))
finally:
srtfile.close()
except (OSError, IOError):
self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
return
if self.params.get('writeinfojson', False): if self.params.get('writeinfojson', False):
infofn = filename + u'.info.json' infofn = filename + u'.info.json'
@ -901,7 +922,7 @@ class FileDownloader(object):
# possible. This is part of rtmpdump's normal usage, AFAIK. # possible. This is part of rtmpdump's normal usage, AFAIK.
basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename] basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)] args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
if self.params['verbose']: if self.params.get('verbose', False):
try: try:
import pipes import pipes
shell_quote = lambda args: ' '.join(map(pipes.quote, args)) shell_quote = lambda args: ' '.join(map(pipes.quote, args))
@ -1206,6 +1227,10 @@ class YoutubeIE(InfoExtractor):
"""Report attempt to download video info webpage.""" """Report attempt to download video info webpage."""
self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
def report_video_subtitles_download(self, video_id):
    """Report attempt to download video subtitles."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
def report_information_extraction(self, video_id): def report_information_extraction(self, video_id):
"""Report attempt to extract video information.""" """Report attempt to extract video information."""
self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
@ -1218,6 +1243,23 @@ class YoutubeIE(InfoExtractor):
"""Indicate the download will use the RTMP protocol.""" """Indicate the download will use the RTMP protocol."""
self._downloader.to_screen(u'[youtube] RTMP download detected') self._downloader.to_screen(u'[youtube] RTMP download detected')
def _closed_captions_xml_to_srt(self, xml_string):
srt = ''
texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
# TODO parse xml instead of regex
for n, (start, dur_tag, dur, caption) in enumerate(texts):
if not dur: dur = '4'
start = float(start)
end = start + float(dur)
start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
srt += str(n) + '\n'
srt += start + ' --> ' + end + '\n'
srt += caption + '\n\n'
return srt
def _print_formats(self, formats): def _print_formats(self, formats):
print 'Available formats:' print 'Available formats:'
for x in formats: for x in formats:
@ -1381,15 +1423,45 @@ class YoutubeIE(InfoExtractor):
lxml.etree lxml.etree
except NameError: except NameError:
video_description = u'No description available.' video_description = u'No description available.'
if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False): mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage) if mobj is not None:
if mobj is not None: video_description = mobj.group(1).decode('utf-8')
video_description = mobj.group(1).decode('utf-8')
else: else:
html_parser = lxml.etree.HTMLParser(encoding='utf-8') html_parser = lxml.etree.HTMLParser(encoding='utf-8')
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
# TODO use another parser # TODO use another parser
# closed captions
video_subtitles = None
if self._downloader.params.get('writesubtitles', False):
self.report_video_subtitles_download(video_id)
request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
try:
srt_list = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
else:
srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
if srt_lang_list:
if self._downloader.params.get('subtitleslang', False):
srt_lang = self._downloader.params.get('subtitleslang')
elif 'en' in srt_lang_list:
srt_lang = 'en'
else:
srt_lang = srt_lang_list[0]
if not srt_lang in srt_lang_list:
self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
else:
request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
try:
srt_xml = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
else:
video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
else:
self._downloader.trouble(u'WARNING: video has no closed captions')
# token # token
video_token = urllib.unquote_plus(video_info['token'][0]) video_token = urllib.unquote_plus(video_info['token'][0])
@ -1462,6 +1534,7 @@ class YoutubeIE(InfoExtractor):
'thumbnail': video_thumbnail.decode('utf-8'), 'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description, 'description': video_description,
'player_url': player_url, 'player_url': player_url,
'subtitles': video_subtitles
}) })
except UnavailableVideoError, err: except UnavailableVideoError, err:
self._downloader.trouble(u'\nERROR: unable to download video') self._downloader.trouble(u'\nERROR: unable to download video')
@ -2059,7 +2132,7 @@ class VimeoIE(InfoExtractor):
video_id = mobj.group(1) video_id = mobj.group(1)
# Retrieve video webpage to extract further information # Retrieve video webpage to extract further information
request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers) request = urllib2.Request(url, None, std_headers)
try: try:
self.report_download_webpage(video_id) self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read() webpage = urllib2.urlopen(request).read()
@ -2072,77 +2145,75 @@ class VimeoIE(InfoExtractor):
# and latter we extract those that are Vimeo specific. # and latter we extract those that are Vimeo specific.
self.report_extraction(video_id) self.report_extraction(video_id)
# Extract title # Extract the config JSON
mobj = re.search(r'<caption>(.*?)</caption>', webpage) config = webpage.split(' = {config:')[1].split(',assets:')[0]
if mobj is None: try:
self._downloader.trouble(u'ERROR: unable to extract video title') config = json.loads(config)
except:
self._downloader.trouble(u'ERROR: unable to extract info section')
return return
video_title = mobj.group(1).decode('utf-8')
# Extract title
video_title = config["video"]["title"]
simple_title = _simplify_title(video_title) simple_title = _simplify_title(video_title)
# Extract uploader # Extract uploader
mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage) video_uploader = config["video"]["owner"]["name"]
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video uploader')
return
video_uploader = mobj.group(1).decode('utf-8')
# Extract video thumbnail # Extract video thumbnail
mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage) video_thumbnail = config["video"]["thumbnail"]
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
return
video_thumbnail = mobj.group(1).decode('utf-8')
# # Extract video description # Extract video description
# mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage) try:
# if mobj is None: lxml.etree
# self._downloader.trouble(u'ERROR: unable to extract video description') except NameError:
# return video_description = u'No description available.'
# video_description = mobj.group(1).decode('utf-8') mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
# if not video_description: video_description = 'No description available.' if mobj is not None:
video_description = 'Foo.' video_description = mobj.group(1)
# Vimeo specific: extract request signature
mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract request signature')
return
sig = mobj.group(1).decode('utf-8')
# Vimeo specific: extract video quality information
mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video quality information')
return
quality = mobj.group(1).decode('utf-8')
if int(quality) == 1:
quality = 'hd'
else: else:
quality = 'sd' html_parser = lxml.etree.HTMLParser()
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
# TODO use another parser
# Vimeo specific: Extract request signature expiration # Extract upload date
mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage) video_upload_date = u'NA'
if mobj is None: mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
self._downloader.trouble(u'ERROR: unable to extract request signature expiration') if mobj is not None:
video_upload_date = mobj.group(1)
# Vimeo specific: extract request signature and timestamp
sig = config['request']['signature']
timestamp = config['request']['timestamp']
# Vimeo specific: extract video codec and quality information
# TODO bind to format param
codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
for codec in codecs:
if codec[0] in config["video"]["files"]:
video_codec = codec[0]
video_extension = codec[1]
if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
else: quality = 'sd'
break
else:
self._downloader.trouble(u'ERROR: no known codec found')
return return
sig_exp = mobj.group(1).decode('utf-8')
video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality) video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
%(video_id, sig, timestamp, quality, video_codec.upper())
try: try:
# Process video information # Process video information
self._downloader.process_info({ self._downloader.process_info({
'id': video_id.decode('utf-8'), 'id': video_id,
'url': video_url, 'url': video_url,
'uploader': video_uploader, 'uploader': video_uploader,
'upload_date': u'NA', 'upload_date': video_upload_date,
'title': video_title, 'title': video_title,
'stitle': simple_title, 'stitle': simple_title,
'ext': u'mp4', 'ext': video_extension,
'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description,
'thumbnail': video_thumbnail, 'thumbnail': video_thumbnail,
'description': video_description, 'description': video_description,
'player_url': None, 'player_url': None,
@ -2251,9 +2322,7 @@ class GenericIE(InfoExtractor):
class YoutubeSearchIE(InfoExtractor): class YoutubeSearchIE(InfoExtractor):
"""Information Extractor for YouTube search queries.""" """Information Extractor for YouTube search queries."""
_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+' _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en' _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
_youtube_ie = None _youtube_ie = None
_max_youtube_results = 1000 _max_youtube_results = 1000
IE_NAME = u'youtube:search' IE_NAME = u'youtube:search'
@ -2304,45 +2373,39 @@ class YoutubeSearchIE(InfoExtractor):
"""Downloads a specified number of results for a query""" """Downloads a specified number of results for a query"""
video_ids = [] video_ids = []
already_seen = set() pagenum = 0
pagenum = 1 limit = n
while True: while (50 * pagenum) < limit:
self.report_download_page(query, pagenum) self.report_download_page(query, pagenum+1)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
request = urllib2.Request(result_url) request = urllib2.Request(result_url)
try: try:
page = urllib2.urlopen(request).read() data = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err: except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
return return
api_response = json.loads(data)['data']
# Extract video identifiers new_ids = list(video['id'] for video in api_response['items'])
for mobj in re.finditer(self._VIDEO_INDICATOR, page): video_ids += new_ids
video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
if video_id not in already_seen:
video_ids.append(video_id)
already_seen.add(video_id)
if len(video_ids) == n:
# Specified n videos reached
for id in video_ids:
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
return
if re.search(self._MORE_PAGES_INDICATOR, page) is None: limit = min(n, api_response['totalItems'])
for id in video_ids: pagenum += 1
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
return
pagenum = pagenum + 1 if len(video_ids) > n:
video_ids = video_ids[:n]
for id in video_ids:
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
return
class GoogleSearchIE(InfoExtractor): class GoogleSearchIE(InfoExtractor):
"""Information Extractor for Google Video search queries.""" """Information Extractor for Google Video search queries."""
_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+' _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en' _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&' _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
_MORE_PAGES_INDICATOR = r'<span>Next</span>' _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
_google_ie = None _google_ie = None
_max_google_results = 1000 _max_google_results = 1000
IE_NAME = u'video.google:search' IE_NAME = u'video.google:search'
@ -2393,12 +2456,11 @@ class GoogleSearchIE(InfoExtractor):
"""Downloads a specified number of results for a query""" """Downloads a specified number of results for a query"""
video_ids = [] video_ids = []
already_seen = set() pagenum = 0
pagenum = 1
while True: while True:
self.report_download_page(query, pagenum) self.report_download_page(query, pagenum)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
request = urllib2.Request(result_url) request = urllib2.Request(result_url)
try: try:
page = urllib2.urlopen(request).read() page = urllib2.urlopen(request).read()
@ -2409,9 +2471,8 @@ class GoogleSearchIE(InfoExtractor):
# Extract video identifiers # Extract video identifiers
for mobj in re.finditer(self._VIDEO_INDICATOR, page): for mobj in re.finditer(self._VIDEO_INDICATOR, page):
video_id = mobj.group(1) video_id = mobj.group(1)
if video_id not in already_seen: if video_id not in video_ids:
video_ids.append(video_id) video_ids.append(video_id)
already_seen.add(video_id)
if len(video_ids) == n: if len(video_ids) == n:
# Specified n videos reached # Specified n videos reached
for id in video_ids: for id in video_ids:
@ -2520,7 +2581,7 @@ class YoutubePlaylistIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
_VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>' _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
_youtube_ie = None _youtube_ie = None
IE_NAME = u'youtube:playlist' IE_NAME = u'youtube:playlist'
@ -2572,7 +2633,7 @@ class YoutubePlaylistIE(InfoExtractor):
# Extract video identifiers # Extract video identifiers
ids_in_page = [] ids_in_page = []
for mobj in re.finditer(self._VIDEO_INDICATOR, page): for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
if mobj.group(1) not in ids_in_page: if mobj.group(1) not in ids_in_page:
ids_in_page.append(mobj.group(1)) ids_in_page.append(mobj.group(1))
video_ids.extend(ids_in_page) video_ids.extend(ids_in_page)
@ -2583,7 +2644,10 @@ class YoutubePlaylistIE(InfoExtractor):
playliststart = self._downloader.params.get('playliststart', 1) - 1 playliststart = self._downloader.params.get('playliststart', 1) - 1
playlistend = self._downloader.params.get('playlistend', -1) playlistend = self._downloader.params.get('playlistend', -1)
video_ids = video_ids[playliststart:playlistend] if playlistend == -1:
video_ids = video_ids[playliststart:]
else:
video_ids = video_ids[playliststart:playlistend]
for id in video_ids: for id in video_ids:
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
@ -4329,6 +4393,12 @@ def parseOpts():
action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-F', '--list-formats', video_format.add_option('-F', '--list-formats',
action='store_true', dest='listformats', help='list all available formats (currently youtube only)') action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
video_format.add_option('--write-srt',
action='store_true', dest='writesubtitles',
help='write video closed captions to a .srt file (currently youtube only)', default=False)
video_format.add_option('--srt-lang',
action='store', dest='subtitleslang', metavar='LANG',
help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
verbosity.add_option('-q', '--quiet', verbosity.add_option('-q', '--quiet',
@ -4593,6 +4663,8 @@ def _real_main():
'updatetime': opts.updatetime, 'updatetime': opts.updatetime,
'writedescription': opts.writedescription, 'writedescription': opts.writedescription,
'writeinfojson': opts.writeinfojson, 'writeinfojson': opts.writeinfojson,
'writesubtitles': opts.writesubtitles,
'subtitleslang': opts.subtitleslang,
'matchtitle': opts.matchtitle, 'matchtitle': opts.matchtitle,
'rejecttitle': opts.rejecttitle, 'rejecttitle': opts.rejecttitle,
'max_downloads': opts.max_downloads, 'max_downloads': opts.max_downloads,