From db77d8b697563067ac46e9c4600696c2ddd1a327 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Sun, 19 Jan 2014 14:51:18 +0100 Subject: [PATCH] New script to mirror pages --- gen_site.pl | 434 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 363 insertions(+), 71 deletions(-) diff --git a/gen_site.pl b/gen_site.pl index 62b512da..76c131b4 100644 --- a/gen_site.pl +++ b/gen_site.pl @@ -5,119 +5,413 @@ use strict; use warnings; use threads; use threads::shared; + use Getopt::Long; +use Thread::Queue; -our $baseurl = "http://localhost/"; -our $baseadmin = "admin/"; -our $basehome = ""; -our $baseteams = "connected/"; -our $threads = 5; +our $outdir = "outest"; + +our $baseurl = "http://localhost"; +our $baseadmin = "/admin/"; +our $basehome = "/"; +our $baseteams = "/connected/"; +our $threads = 6; -sub test +sub genHome(;$) { - my $p = FicPage->new("http://localhost/connected/169/6"); - say $p->getSavePath(); - $p->fetch(); - $p->toRightURL(); - $p->treatLinks(); - print $p->getNearLinks(); - $p->save(); + my $m = shift // Mirror->new(); + $m->add_url($basehome); + + return $m; } -sub genHome() +sub genFull(;$) { - mirror($baseurl . $basehome); + my $m = shift // Mirror->new(); + $m->add_url($baseteams); + + return $m; } -sub genTeam($) +sub genTeam($;$) { my $team_id = shift; + my $m = shift // Mirror->new(); - my $p = FicPage->new($baseurl . $baseteams . $team_id); - $p->fetch(); + $m->add_url($baseteams . $team_id); - my @ths; - - for my $link ($p->getNearLinks()) { - push @ths, threads->create(\&mirror, $link) if ($link =~ /\Q$baseteams\E/); - } - - for my $th (@ths) { - $th->join(); - } - + return $m; } -sub genFull() +sub genTeamTheme($$;$) { - my $p = FicPage->new($baseurl . $baseteams); - $p->fetch(); + my $team_id = shift; + my $theme_id = shift; + my $m = shift // Mirror->new(); - my @ths; + $m->add_url($baseteams . $team_id . "/" . $theme_id); - for my $link ($p->getNearLinks()) { - push @ths, threads->create(\&mirror, $link) if ($link =~ /\Q$baseteams\E/); - } - - for my $th (@ths) { - $th->join(); - } + return $m; } -sub parse($) + +my $queue :shared = Thread::Queue->new(); +my $main_thread; + +sub manage { - say shift; -} + my $m = shift // Mirror->new(); -sub mirror -{ - my $p = FicPage->new($baseurl . $_[0]); - - return if ($p->alreadySaved()); - - $p->fetch(); - - my @links = $p->getNearLinks(); - - $p->treatLinks(); - $p->save(); - - for my $link (@links) + while (1) { - mirror($link) if ($_[0] ne $link); + if ($queue->pending() <= 0) + { + lock($queue); + cond_wait($queue) while $queue->pending() <= 0; + } + + my $cmd; + { + lock($queue); + $cmd = $queue->peek(); + } + + $m->{end} = 0 if ($m->{end}); + + for ($cmd) + { + if (/^all$/) + { + say "Generate all teams"; + genFull($m); + } + elsif (/^HOME$/) + { + say "Generate full public part"; + genHome($m); + } + elsif (/^TEAM([0-9]+)$/) + { + say "Generate team: $1"; + genTeam($1, $m); + } + elsif (/^TEAM([0-9]+),([0-9]+)$/) + { + say "Generate team theme: $1/$2"; + genTeamTheme($1, $2, $m); + } + elsif (/^(reset)*r(e(s(e(t)?)?)?)?/) + { + say "Performing RESET ..."; + $m->reset(); + } + elsif (/^J$/) + { + say "JOIN receive, stopping all threads..."; + $m->stop(); + return 1; + } + elsif (/^help/i) + { + say "TODO, sorry :("; + } + else { + say "$cmd is not a valid command"; + } + } + + next if ! $m->join(); + say ">>> $cmd done"; + { + lock($queue); + + my $dq = $queue->dequeue(); + if ($cmd ne $dq) + { + $queue->insert(0, $dq); + + for (my $i = 0; $i < $queue->pending(); $i++) + { + if ($queue->peek($i) eq $cmd) + { + $queue->extract($i); + last; + } + } + } + } } } +sub parse($$) +{ + my $m = shift; + my $change_current = 0; + + for my $cmd ($_[0] =~ /([^:]+)/g) + { + my $len = length($cmd); + + # Search the right position + my $i; + for ($i = 0; $i < $queue->pending(); $i++) + { + last if ($len > length($queue->peek($i))); + } + #say "Inserting $cmd at position $i/".$queue->pending(); + $queue->insert($i, $cmd); + $change_current = 1 if $i == 0 && $queue->pending() != 1; + } + + if ($change_current) + { + say "Priority item have changed, stoping running threads"; + $m->end(); + } + + { + lock($queue); + cond_broadcast($queue); + } + + #print Dumper($queue); +} + + # Parse arguments my $help; my $deamon; GetOptions ("threads|thread|t=i" => \$threads, "baseadmin|ba=s" => \$baseadmin, "basehome|bh=s" => \$basehome, "baseteams|bt=s" => \$baseteams, - "deamon|d|?" => \$deamon, + "outdir|out|o=s" => \$outdir, + "deamon|d" => \$deamon, "help|h|?" => \$help); if ($deamon) { + my $m :shared = Mirror->new(); + + $main_thread = threads->create(\&manage, $m); + while(<>) { - parse($_); + chomp $_; + parse($m, $_); } + + parse($m, "J"); + $main_thread->join(); } elsif (@ARGV) { - while (shift) { - parse($_); + my $m :shared = Mirror->new(); + + $main_thread = threads->create(\&manage, $m); + + while ($_ = shift) { + parse($m, $_); } + + parse($m, "J"); + $main_thread->join(); } else { - my $th = threads->create(\&genHome); - genFull(); + my $m = genFull(); + genHome($m); - $th->join(); + $m->join(); } + +package Mirror; + +use v5.10.1; +use strict; +use warnings; +use threads; +use threads::shared; + +use Thread::Queue; + +sub new($) +{ + my $class = shift; + my $self :shared = shared_clone({ + threads => [], + stop => 0, + }); + + bless $self, $class; + + $self->start(); + + return $self; +} + +sub start($) +{ + my $self = shift; + + $self->{add} = Thread::Queue->new("#RESET"); + $self->{todo} = Thread::Queue->new(); + $self->{waiting} = 0; + $self->{end} = 0; + + push @{ $self->{threads} }, threads->create(\&run_add, $self)->tid(); + + for (my $i = 0; $i < $main::threads; $i++) { + push @{ $self->{threads} }, threads->create(\&run, $self, $i + 1)->tid(); + } +} + +sub reset($) +{ + my $self = shift; + + $self->{add}->enqueue("#RESET"); +} + +sub end($) +{ + my $self = shift; + + $self->{end} = 1; + + while($self->{add}->dequeue_nb()) { } + while($self->{todo}->dequeue_nb()) { } + + $self->reset(); + + { + lock($self); + + cond_broadcast($self); + } + + return $self->join(); +} + +sub stop($) +{ + my $self = shift; + + $self->{stop} = 1; + + return $self->end(); +} + +sub add_url($@) +{ + my $self = shift; + + while (my $link = shift) + { + $self->{add}->enqueue($link); + } +} + +sub run_add +{ + my $self = shift; + my $id = shift // 0; + + my %urlseen; + + while (! $self->{stop}) + { + my $link = $self->{add}->dequeue_nb(); + + if (! defined $link) + { + { + lock($self); + $self->{waiting} += 1; + cond_broadcast($self); + } + $link = $self->{add}->dequeue(); + last if (! defined $link); + { + lock($self); + $self->{waiting} -= 1; + } + } + + next if (exists $urlseen{$link}); + + if ($link eq "#RESET") + { + %urlseen = (); + } + else + { + $self->{todo}->enqueue($link); + $urlseen{$link} = 1; + } + } +} + +sub run +{ + my $self = shift; + my $id = shift // 0; + + while (! $self->{stop}) + { + my $url = $self->{todo}->dequeue_nb(); + + if (! defined $url) + { + { + lock($self); + $self->{waiting} += 1; + cond_broadcast($self); + } + $url = $self->{todo}->dequeue(); + last if (! defined $url); + { + lock($self); + $self->{waiting} -= 1; + } + } + + say "[$id] $baseurl$url"; + + my $p = FicPage->new($baseurl . $url); + $p->fetch(); + + $p->toRightURL() if ($url !~ /\/$/); + + my @links = $p->getNearLinks() if ! $self->{end}; + + $p->treatLinks(); + $p->save(); + + for my $link (@links) + { + $self->add_url($link) if ($link ne $url && $link =~ /^?\Q$url\E/); + } + } +} + +sub join($) +{ + my $self = shift; + + sleep 1 if ($self->{waiting} == $main::threads + 1); + + return ($self->{waiting} > 0 && ! $self->{end}) if ($self->{waiting} == $main::threads + 1); + + { + lock($self); + cond_wait($self) until ($self->{waiting} >= $main::threads + 1 || $self->{waiting} < 0 || $self->{stop}); + } + + return ($self->{waiting} > 0 && ! $self->{end}); +} + + package FicPage; use v5.10.1; @@ -148,15 +442,13 @@ sub toRightURL($) my $search = $self->{url}; $search =~ s!/$!!; - for my $url ($self->getNearLinks()) + for my $url ($self->getLinks()) { $url = $baseurl . $url; if ($url =~ /^\Q$search\E/) { $url =~ s!^(\Q$search\E[^/]*).*$!$1/!; - $url .= "/" if ($url !~ /\.[a-z0-9]{2,4}$/); - $self->{url} = $url; return $url; } @@ -176,14 +468,14 @@ sub getNearLinks($) { my $self = shift; - return $self->{content} =~ /(?:src|href)="(?:\Q$baseurl\E|(?:\/?\.\.)+|\/)([^"]+)"/g; + return $self->{content} =~ /(?:src|href)="(?:\Q$main::baseurl\E|(?:\/?\.\.)+)?([^:"]+)"/g; } sub treatLinks($) { my $self = shift; - $self->{content} =~ s!(src|href)="/( \Q$baseteams\E[^/]+/ | \Q$baseadmin\E | \Q$basehome\E)([^"]*)"!$1="/$3"!gx; + $self->{content} =~ s!(src|href)="( \Q$baseteams\E[^/]+/ | \Q$baseadmin\E | \Q$basehome\E)([^"]*)"!$1="/$3"!gx; } sub fetch($) @@ -207,13 +499,13 @@ sub alreadySaved($;$) sub getSavePath($;$) { my $self = shift; - my $basedir = shift // "outest"; + my $basedir = shift // $main::outdir; # Convert URL to real directory path my $path = $self->{url}; $path =~ s/^\Q$main::baseurl\E//; - return "$basedir/$path"; + return "$basedir$path"; } sub save($;$)