New script to mirror pages
This commit is contained in:
parent
1d7c89994d
commit
db77d8b697
434
gen_site.pl
434
gen_site.pl
@ -5,119 +5,413 @@ use strict;
|
||||
use warnings;
|
||||
use threads;
|
||||
use threads::shared;
|
||||
|
||||
use Getopt::Long;
|
||||
use Thread::Queue;
|
||||
|
||||
our $baseurl = "http://localhost/";
|
||||
our $baseadmin = "admin/";
|
||||
our $basehome = "";
|
||||
our $baseteams = "connected/";
|
||||
our $threads = 5;
|
||||
our $outdir = "outest";
|
||||
|
||||
our $baseurl = "http://localhost";
|
||||
our $baseadmin = "/admin/";
|
||||
our $basehome = "/";
|
||||
our $baseteams = "/connected/";
|
||||
our $threads = 6;
|
||||
|
||||
|
||||
sub test
|
||||
sub genHome(;$)
|
||||
{
|
||||
my $p = FicPage->new("http://localhost/connected/169/6");
|
||||
say $p->getSavePath();
|
||||
$p->fetch();
|
||||
$p->toRightURL();
|
||||
$p->treatLinks();
|
||||
print $p->getNearLinks();
|
||||
$p->save();
|
||||
my $m = shift // Mirror->new();
|
||||
$m->add_url($basehome);
|
||||
|
||||
return $m;
|
||||
}
|
||||
|
||||
sub genHome()
|
||||
sub genFull(;$)
|
||||
{
|
||||
mirror($baseurl . $basehome);
|
||||
my $m = shift // Mirror->new();
|
||||
$m->add_url($baseteams);
|
||||
|
||||
return $m;
|
||||
}
|
||||
|
||||
sub genTeam($)
|
||||
sub genTeam($;$)
|
||||
{
|
||||
my $team_id = shift;
|
||||
my $m = shift // Mirror->new();
|
||||
|
||||
my $p = FicPage->new($baseurl . $baseteams . $team_id);
|
||||
$p->fetch();
|
||||
$m->add_url($baseteams . $team_id);
|
||||
|
||||
my @ths;
|
||||
|
||||
for my $link ($p->getNearLinks()) {
|
||||
push @ths, threads->create(\&mirror, $link) if ($link =~ /\Q$baseteams\E/);
|
||||
}
|
||||
|
||||
for my $th (@ths) {
|
||||
$th->join();
|
||||
}
|
||||
|
||||
return $m;
|
||||
}
|
||||
|
||||
sub genFull()
|
||||
sub genTeamTheme($$;$)
|
||||
{
|
||||
my $p = FicPage->new($baseurl . $baseteams);
|
||||
$p->fetch();
|
||||
my $team_id = shift;
|
||||
my $theme_id = shift;
|
||||
my $m = shift // Mirror->new();
|
||||
|
||||
my @ths;
|
||||
$m->add_url($baseteams . $team_id . "/" . $theme_id);
|
||||
|
||||
for my $link ($p->getNearLinks()) {
|
||||
push @ths, threads->create(\&mirror, $link) if ($link =~ /\Q$baseteams\E/);
|
||||
}
|
||||
|
||||
for my $th (@ths) {
|
||||
$th->join();
|
||||
}
|
||||
return $m;
|
||||
}
|
||||
|
||||
sub parse($)
|
||||
|
||||
my $queue :shared = Thread::Queue->new();
|
||||
my $main_thread;
|
||||
|
||||
sub manage
|
||||
{
|
||||
say shift;
|
||||
}
|
||||
my $m = shift // Mirror->new();
|
||||
|
||||
sub mirror
|
||||
{
|
||||
my $p = FicPage->new($baseurl . $_[0]);
|
||||
|
||||
return if ($p->alreadySaved());
|
||||
|
||||
$p->fetch();
|
||||
|
||||
my @links = $p->getNearLinks();
|
||||
|
||||
$p->treatLinks();
|
||||
$p->save();
|
||||
|
||||
for my $link (@links)
|
||||
while (1)
|
||||
{
|
||||
mirror($link) if ($_[0] ne $link);
|
||||
if ($queue->pending() <= 0)
|
||||
{
|
||||
lock($queue);
|
||||
cond_wait($queue) while $queue->pending() <= 0;
|
||||
}
|
||||
|
||||
my $cmd;
|
||||
{
|
||||
lock($queue);
|
||||
$cmd = $queue->peek();
|
||||
}
|
||||
|
||||
$m->{end} = 0 if ($m->{end});
|
||||
|
||||
for ($cmd)
|
||||
{
|
||||
if (/^all$/)
|
||||
{
|
||||
say "Generate all teams";
|
||||
genFull($m);
|
||||
}
|
||||
elsif (/^HOME$/)
|
||||
{
|
||||
say "Generate full public part";
|
||||
genHome($m);
|
||||
}
|
||||
elsif (/^TEAM([0-9]+)$/)
|
||||
{
|
||||
say "Generate team: $1";
|
||||
genTeam($1, $m);
|
||||
}
|
||||
elsif (/^TEAM([0-9]+),([0-9]+)$/)
|
||||
{
|
||||
say "Generate team theme: $1/$2";
|
||||
genTeamTheme($1, $2, $m);
|
||||
}
|
||||
elsif (/^(reset)*r(e(s(e(t)?)?)?)?/)
|
||||
{
|
||||
say "Performing RESET ...";
|
||||
$m->reset();
|
||||
}
|
||||
elsif (/^J$/)
|
||||
{
|
||||
say "JOIN receive, stopping all threads...";
|
||||
$m->stop();
|
||||
return 1;
|
||||
}
|
||||
elsif (/^help/i)
|
||||
{
|
||||
say "TODO, sorry :(";
|
||||
}
|
||||
else {
|
||||
say "$cmd is not a valid command";
|
||||
}
|
||||
}
|
||||
|
||||
next if ! $m->join();
|
||||
say ">>> $cmd done";
|
||||
{
|
||||
lock($queue);
|
||||
|
||||
my $dq = $queue->dequeue();
|
||||
if ($cmd ne $dq)
|
||||
{
|
||||
$queue->insert(0, $dq);
|
||||
|
||||
for (my $i = 0; $i < $queue->pending(); $i++)
|
||||
{
|
||||
if ($queue->peek($i) eq $cmd)
|
||||
{
|
||||
$queue->extract($i);
|
||||
last;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub parse($$)
|
||||
{
|
||||
my $m = shift;
|
||||
my $change_current = 0;
|
||||
|
||||
for my $cmd ($_[0] =~ /([^:]+)/g)
|
||||
{
|
||||
my $len = length($cmd);
|
||||
|
||||
# Search the right position
|
||||
my $i;
|
||||
for ($i = 0; $i < $queue->pending(); $i++)
|
||||
{
|
||||
last if ($len > length($queue->peek($i)));
|
||||
}
|
||||
#say "Inserting $cmd at position $i/".$queue->pending();
|
||||
$queue->insert($i, $cmd);
|
||||
$change_current = 1 if $i == 0 && $queue->pending() != 1;
|
||||
}
|
||||
|
||||
if ($change_current)
|
||||
{
|
||||
say "Priority item have changed, stoping running threads";
|
||||
$m->end();
|
||||
}
|
||||
|
||||
{
|
||||
lock($queue);
|
||||
cond_broadcast($queue);
|
||||
}
|
||||
|
||||
#print Dumper($queue);
|
||||
}
|
||||
|
||||
|
||||
# Parse arguments
|
||||
my $help; my $deamon;
|
||||
GetOptions ("threads|thread|t=i" => \$threads,
|
||||
"baseadmin|ba=s" => \$baseadmin,
|
||||
"basehome|bh=s" => \$basehome,
|
||||
"baseteams|bt=s" => \$baseteams,
|
||||
"deamon|d|?" => \$deamon,
|
||||
"outdir|out|o=s" => \$outdir,
|
||||
"deamon|d" => \$deamon,
|
||||
"help|h|?" => \$help);
|
||||
|
||||
if ($deamon)
|
||||
{
|
||||
my $m :shared = Mirror->new();
|
||||
|
||||
$main_thread = threads->create(\&manage, $m);
|
||||
|
||||
while(<>) {
|
||||
parse($_);
|
||||
chomp $_;
|
||||
parse($m, $_);
|
||||
}
|
||||
|
||||
parse($m, "J");
|
||||
$main_thread->join();
|
||||
}
|
||||
elsif (@ARGV)
|
||||
{
|
||||
while (shift) {
|
||||
parse($_);
|
||||
my $m :shared = Mirror->new();
|
||||
|
||||
$main_thread = threads->create(\&manage, $m);
|
||||
|
||||
while ($_ = shift) {
|
||||
parse($m, $_);
|
||||
}
|
||||
|
||||
parse($m, "J");
|
||||
$main_thread->join();
|
||||
}
|
||||
else
|
||||
{
|
||||
my $th = threads->create(\&genHome);
|
||||
genFull();
|
||||
my $m = genFull();
|
||||
genHome($m);
|
||||
|
||||
$th->join();
|
||||
$m->join();
|
||||
}
|
||||
|
||||
|
||||
package Mirror;
|
||||
|
||||
use v5.10.1;
|
||||
use strict;
|
||||
use warnings;
|
||||
use threads;
|
||||
use threads::shared;
|
||||
|
||||
use Thread::Queue;
|
||||
|
||||
sub new($)
|
||||
{
|
||||
my $class = shift;
|
||||
my $self :shared = shared_clone({
|
||||
threads => [],
|
||||
stop => 0,
|
||||
});
|
||||
|
||||
bless $self, $class;
|
||||
|
||||
$self->start();
|
||||
|
||||
return $self;
|
||||
}
|
||||
|
||||
sub start($)
|
||||
{
|
||||
my $self = shift;
|
||||
|
||||
$self->{add} = Thread::Queue->new("#RESET");
|
||||
$self->{todo} = Thread::Queue->new();
|
||||
$self->{waiting} = 0;
|
||||
$self->{end} = 0;
|
||||
|
||||
push @{ $self->{threads} }, threads->create(\&run_add, $self)->tid();
|
||||
|
||||
for (my $i = 0; $i < $main::threads; $i++) {
|
||||
push @{ $self->{threads} }, threads->create(\&run, $self, $i + 1)->tid();
|
||||
}
|
||||
}
|
||||
|
||||
sub reset($)
|
||||
{
|
||||
my $self = shift;
|
||||
|
||||
$self->{add}->enqueue("#RESET");
|
||||
}
|
||||
|
||||
sub end($)
|
||||
{
|
||||
my $self = shift;
|
||||
|
||||
$self->{end} = 1;
|
||||
|
||||
while($self->{add}->dequeue_nb()) { }
|
||||
while($self->{todo}->dequeue_nb()) { }
|
||||
|
||||
$self->reset();
|
||||
|
||||
{
|
||||
lock($self);
|
||||
|
||||
cond_broadcast($self);
|
||||
}
|
||||
|
||||
return $self->join();
|
||||
}
|
||||
|
||||
sub stop($)
|
||||
{
|
||||
my $self = shift;
|
||||
|
||||
$self->{stop} = 1;
|
||||
|
||||
return $self->end();
|
||||
}
|
||||
|
||||
sub add_url($@)
|
||||
{
|
||||
my $self = shift;
|
||||
|
||||
while (my $link = shift)
|
||||
{
|
||||
$self->{add}->enqueue($link);
|
||||
}
|
||||
}
|
||||
|
||||
sub run_add
|
||||
{
|
||||
my $self = shift;
|
||||
my $id = shift // 0;
|
||||
|
||||
my %urlseen;
|
||||
|
||||
while (! $self->{stop})
|
||||
{
|
||||
my $link = $self->{add}->dequeue_nb();
|
||||
|
||||
if (! defined $link)
|
||||
{
|
||||
{
|
||||
lock($self);
|
||||
$self->{waiting} += 1;
|
||||
cond_broadcast($self);
|
||||
}
|
||||
$link = $self->{add}->dequeue();
|
||||
last if (! defined $link);
|
||||
{
|
||||
lock($self);
|
||||
$self->{waiting} -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
next if (exists $urlseen{$link});
|
||||
|
||||
if ($link eq "#RESET")
|
||||
{
|
||||
%urlseen = ();
|
||||
}
|
||||
else
|
||||
{
|
||||
$self->{todo}->enqueue($link);
|
||||
$urlseen{$link} = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub run
|
||||
{
|
||||
my $self = shift;
|
||||
my $id = shift // 0;
|
||||
|
||||
while (! $self->{stop})
|
||||
{
|
||||
my $url = $self->{todo}->dequeue_nb();
|
||||
|
||||
if (! defined $url)
|
||||
{
|
||||
{
|
||||
lock($self);
|
||||
$self->{waiting} += 1;
|
||||
cond_broadcast($self);
|
||||
}
|
||||
$url = $self->{todo}->dequeue();
|
||||
last if (! defined $url);
|
||||
{
|
||||
lock($self);
|
||||
$self->{waiting} -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
say "[$id] $baseurl$url";
|
||||
|
||||
my $p = FicPage->new($baseurl . $url);
|
||||
$p->fetch();
|
||||
|
||||
$p->toRightURL() if ($url !~ /\/$/);
|
||||
|
||||
my @links = $p->getNearLinks() if ! $self->{end};
|
||||
|
||||
$p->treatLinks();
|
||||
$p->save();
|
||||
|
||||
for my $link (@links)
|
||||
{
|
||||
$self->add_url($link) if ($link ne $url && $link =~ /^?\Q$url\E/);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub join($)
|
||||
{
|
||||
my $self = shift;
|
||||
|
||||
sleep 1 if ($self->{waiting} == $main::threads + 1);
|
||||
|
||||
return ($self->{waiting} > 0 && ! $self->{end}) if ($self->{waiting} == $main::threads + 1);
|
||||
|
||||
{
|
||||
lock($self);
|
||||
cond_wait($self) until ($self->{waiting} >= $main::threads + 1 || $self->{waiting} < 0 || $self->{stop});
|
||||
}
|
||||
|
||||
return ($self->{waiting} > 0 && ! $self->{end});
|
||||
}
|
||||
|
||||
|
||||
package FicPage;
|
||||
|
||||
use v5.10.1;
|
||||
@ -148,15 +442,13 @@ sub toRightURL($)
|
||||
my $search = $self->{url};
|
||||
$search =~ s!/$!!;
|
||||
|
||||
for my $url ($self->getNearLinks())
|
||||
for my $url ($self->getLinks())
|
||||
{
|
||||
$url = $baseurl . $url;
|
||||
if ($url =~ /^\Q$search\E/)
|
||||
{
|
||||
$url =~ s!^(\Q$search\E[^/]*).*$!$1/!;
|
||||
|
||||
$url .= "/" if ($url !~ /\.[a-z0-9]{2,4}$/);
|
||||
|
||||
$self->{url} = $url;
|
||||
return $url;
|
||||
}
|
||||
@ -176,14 +468,14 @@ sub getNearLinks($)
|
||||
{
|
||||
my $self = shift;
|
||||
|
||||
return $self->{content} =~ /(?:src|href)="(?:\Q$baseurl\E|(?:\/?\.\.)+|\/)([^"]+)"/g;
|
||||
return $self->{content} =~ /(?:src|href)="(?:\Q$main::baseurl\E|(?:\/?\.\.)+)?([^:"]+)"/g;
|
||||
}
|
||||
|
||||
sub treatLinks($)
|
||||
{
|
||||
my $self = shift;
|
||||
|
||||
$self->{content} =~ s!(src|href)="/( \Q$baseteams\E[^/]+/ | \Q$baseadmin\E | \Q$basehome\E)([^"]*)"!$1="/$3"!gx;
|
||||
$self->{content} =~ s!(src|href)="( \Q$baseteams\E[^/]+/ | \Q$baseadmin\E | \Q$basehome\E)([^"]*)"!$1="/$3"!gx;
|
||||
}
|
||||
|
||||
sub fetch($)
|
||||
@ -207,13 +499,13 @@ sub alreadySaved($;$)
|
||||
sub getSavePath($;$)
|
||||
{
|
||||
my $self = shift;
|
||||
my $basedir = shift // "outest";
|
||||
my $basedir = shift // $main::outdir;
|
||||
|
||||
# Convert URL to real directory path
|
||||
my $path = $self->{url};
|
||||
$path =~ s/^\Q$main::baseurl\E//;
|
||||
|
||||
return "$basedir/$path";
|
||||
return "$basedir$path";
|
||||
}
|
||||
|
||||
sub save($;$)
|
||||
|
Loading…
Reference in New Issue
Block a user