#!/usr/bin/env perl #============================================================================= # # USAGE: ./gen_site.pl [options] [commands] # # DESCRIPTION: More efficient wget -m # # AUTHOR: Pierre-Olivier Mercier # ORGANIZATION: EPITA SRS # #============================================================================= use v5.12; use strict; use warnings; use threads; use threads::shared; use Cwd 'abs_path'; use File::Basename; use File::Copy; use File::Find; use File::Path qw/make_path remove_tree/; use File::Temp "tempdir"; use Getopt::Long; use IO::Socket; use Thread::Queue; ### GLOBALS ########################################################### our $outdir = "outest"; our $outteams = "/teams/"; our $outerrors = "/errors/"; our $outhome = "/htdocs/"; our $tmpdir_basename = File::Spec->tmpdir; our $tmpdir; our $baseurl = "http://localhost"; our $baseadmin = "/admin/"; our $basehome = "/"; our $baseerrors = "/errors/"; our $baseteams = "/connected/"; our $threads = 10; my $deamon; my $socket; my $queue :shared = Thread::Queue->new(); my $main_thread; ### GENERATORS ######################################################## # Enqueue error pages in the mirror sub genErrors(;$) { my $m = shift // Mirror->new(); $m->add_url($baseerrors . "400"); $m->add_url($baseerrors . "403"); $m->add_url($baseerrors . "404"); $m->add_url($baseerrors . "413"); $m->add_url($baseerrors . "500"); $m->add_url($baseerrors . "502"); return $m; } # Enqueue public pages in the mirror sub genHome(;$) { my $m = shift // Mirror->new(); $m->add_url($basehome . "index.html"); $m->add_url($basehome); genErrors($m); return $m; } # Enqueue all teams in the mirror sub genFull(;$) { my $m = shift // Mirror->new(); $m->add_url($baseteams); return $m; } # Enqueue a team in the mirror sub genTeam($;$) { my $team_id = shift; my $m = shift // Mirror->new(); $m->add_url($baseteams . $team_id); return $m; } # Enqueue theme pages for a given team in the mirror sub genTeamTheme($$;$) { my $team_id = shift; my $theme_id = shift; my $m = shift // Mirror->new(); $m->add_url($baseteams . $team_id . "/" . $theme_id); return $m; } ### TOOLS ############################################################# # Manage the mirror sub manage { my $m = shift // Mirror->new(); while (1) { if ($queue->pending() <= 0) { lock($queue); cond_wait($queue) while $queue->pending() <= 0; } my $cmd; { lock($queue); $cmd = $queue->peek(); } $m->{end} = 0 if ($m->{end}); for ($cmd) { if (/^freeze/) { my $dq; { lock($queue); $dq = $queue->dequeue(); if ($dq ne $cmd) { $queue->insert(0, $dq); $cmd = $dq; } } if ($dq eq $cmd) { lock($queue); cond_wait($queue) while $queue->pending() <= 0 || $queue->peek() !~ /^unfreeze/; last; } } if (/^all$/) { say "Generate all teams"; genFull($m); } elsif (/^HOME/) { say "Generate full public part"; genHome($m); } elsif (/^ERRORS?/) { say "Generate errors pages"; genErrors($m); } elsif (/^TEAM([0-9]+)$/) { say "Generate team: $1"; genTeam($1, $m); } elsif (/^T(?:E(?:A(?:M)?)?)?([0-9]+),([0-9]+)$/) { say "Generate team theme: $1/$2"; genTeamTheme($1, $2, $m); } elsif (/^(C)?(reset)*r(e(s(e(t)?)?)?)?/) { say "Performing RESET ..."; remove_tree($main::tmpdir); mkdir($main::tmpdir); $m->reset(); } elsif (/^(D)?(SYNC)*S(Y(N(C)?)?)?/) { sync($1); } elsif (/^LS$/) { system("ls '$main::tmpdir'"); } elsif (/^J(O(I(N)?)?)?$/) { say "JOIN receive, stopping all threads..."; $m->stop(); return 1; } elsif (/^RT(E(A(M(S)?)?)?)?/) { qx(./pki/CA.sh -gencrl); if (-x "nginx_gen_team.sh") { qx(./nginx_gen_team.sh > ./shared/nginx-teams.conf) } else { say "Unable to find nginx_gen_team.sh" } } elsif (/^help/i) { say "TODO, sorry :("; } else { say "$cmd is not a valid command"; } } next if ! $m->join(); say ">>> $cmd done"; { lock($queue); my $dq = $queue->dequeue(); if ($cmd ne $dq) { $queue->insert(0, $dq); for (my $i = 0; $i < $queue->pending(); $i++) { if ($queue->peek($i) eq $cmd) { $queue->extract($i); last; } } } } } } # Perform a synchronization of the temporary mirrored directory sub sync { if (shift) { say "Full synchronization to $main::outdir"; my $tmpcopy = tempdir(DIR => $tmpdir_basename, CLEANUP => 1); find( sub { if (-f) { my $todir = $File::Find::dir."/"; if ($todir =~ /^\Q$main::tmpdir\E\/?(\Q$main::baseadmin\E|\Q$main::baseteams\E|\Q$main::baseerrors\E|\Q$main::basehome\E)(.*)$/) { $todir = $tmpcopy; return if ($1 eq $main::baseadmin); $todir .= $main::outteams if ($1 eq $main::baseteams); $todir .= $main::outerrors if ($1 eq $main::baseerrors); $todir .= $main::outhome if ($1 eq $main::basehome); $todir .= $2; } make_path($todir, { mode => 0751 }) if (! -d $todir ); copy($File::Find::name, $todir) or warn(q{copy failed:} . $!); } }, $tmpdir ); abs_path($main::outdir); abs_path($tmpcopy); remove_tree($main::outdir, {keep_root => 1}); system("mv '$tmpcopy'/* '$main::outdir/'"); } else { say "Incremental synchronization to $main::outdir"; find( sub { if (-f) { my $todir = $File::Find::dir."/"; if ($todir =~ /^\Q$main::tmpdir\E\/?(\Q$main::baseadmin\E|\Q$main::baseteams\E|\Q$main::basehome\E)(.*)$/) { $todir = $main::outdir; return if ($1 eq $main::baseadmin); $todir .= $main::outteams if ($1 eq $main::baseteams); $todir .= $main::outerrors if ($1 eq $main::baseerrors); $todir .= $main::outhome if ($1 eq $main::basehome); $todir .= $2; } make_path($todir, { mode => 0751 }) if (! -d $todir ); say "$File::Find::name -> $todir"; copy($File::Find::name, $todir) or warn(q{copy failed:} . $!); } }, $tmpdir ); } } # Parse input command and enqueue them sub parse($$;$) { my $m = shift; my $change_current = 0; my $cmds = shift; my $chan_output = shift // \*STDOUT; for my $cmd ($cmds =~ /([^:]+)/g) { if ($cmd =~ "^Creset") { if ($queue->pending() > 5) { say $chan_output "Skip item $cmd, due to huge queue length: ".$queue->pending(); next; } } my $i; if ($cmd =~ "^(un)?freeze") { $i = 0; } elsif ($cmd =~ "^clear") { lock($queue); $queue->dequeue_nb($queue->pending()); next; } elsif ($cmd =~ "^show") { lock($queue); say $chan_output "Pending queue:"; for ($i = 0; $i < $queue->pending(); $i++) { say $chan_output "$i. ", $queue->peek($i); } say $chan_output "empty!" if $queue->pending() == 0; next; } elsif ($cmd =~ "^ls") { say $chan_output "Content of $main::tmpdir:"; opendir(my $dh, $main::tmpdir) || next; while (readdir $dh) { say $chan_output "$_"; } closedir $dh; next; } else { my $len = length($cmd); # Search the right position for ($i = 0; $i < $queue->pending(); $i++) { my $itm = $queue->peek($i); last if ($cmd eq $itm); last if ($len > length($itm)); } } if ($i < $queue->pending() && $queue->peek($i) eq $cmd) { say $chan_output "Skip item $cmd, already at position $i/".$queue->pending(); } else { say $chan_output "Inserting $cmd at position $i/".$queue->pending(); $queue->insert($i, $cmd); $change_current = 1 if $i == 0 && $queue->pending() != 1; } } if ($change_current) { say "Priority item have changed, stoping running threads"; $m->end(); } { lock($queue); cond_broadcast($queue); } #print Dumper($queue); } ### SOCKETS ########################################################### # Create the socket and wait for connection sub create_socket { my $m = shift; my $socket_path = abs_path( shift ); unlink($socket_path) if -e $socket_path; my $socket = IO::Socket::UNIX->new( Local => $socket_path, Type => SOCK_STREAM, Listen => SOMAXCONN, ); chmod 0660, $socket_path; say "Socket listening on $socket_path; waiting for connections..."; while(my $connection = $socket->accept) { #say "New connexion, new thread ready for parsing actions!"; threads->create(\&socket_run, $m, $connection); } } # Manage the socket connection sub socket_run { my $m = shift; my $connection = shift; $connection->autoflush(1); say $connection "You are connected to gen_site.pl, please enter command:"; while (<$connection>) { chomp $_; parse($m, $_, $connection); print $connection "\n"; $connection->flush; } #say "Closing socket connection; stopping thread."; close $connection; } ### MAIN ################################################################ # Parse arguments my $help; GetOptions ("threads|thread|t=i" => \$threads, "baseadmin|ba=s" => \$baseadmin, "basehome|bh=s" => \$basehome, "baseerrors|be=s" => \$baseerrors, "baseteams|bt=s" => \$baseteams, "outdir|out|o=s" => \$outdir, "tempdir|tmpdir|tmp|T=s" => \$tmpdir_basename, "deamon|d" => \$deamon, "socket|s=s" => \$socket, "help|h|?" => \$help); $outdir = abs_path($outdir); $tmpdir_basename = abs_path($tmpdir_basename); # Create temporary directory $tmpdir = tempdir(DIR => $tmpdir_basename); # Daemon mode: run forever until stdin is open if ($deamon) { my $m :shared = Mirror->new(); $main_thread = threads->create(\&manage, $m); threads->create(\&create_socket, $m, $socket) if ($socket); while ($_ = shift) { parse($m, $_); } while(<>) { chomp $_; parse($m, $_); } parse($m, "J"); $main_thread->join(); } # Run given commands elsif (@ARGV) { my $m :shared = Mirror->new(); $main_thread = threads->create(\&manage, $m); threads->create(\&create_socket, $m, $socket) if ($socket); while ($_ = shift) { parse($m, $_); } parse($m, "J"); $main_thread->join(); } # Just performs a full generation else { my $m = genFull(); genHome($m); $m->join(); sync(1); } # Clean tmp remove_tree($main::tmpdir); package Mirror; # Structure to store a mirror state of a website use v5.10.1; use strict; use warnings; use threads; use threads::shared; use Thread::Queue; # Initialize the class sub new($) { my $class = shift; my $self :shared = shared_clone({ threads => [], stop => 0, }); bless $self, $class; $self->start(); return $self; } # Launch threads and do more initialization sub start($) { my $self = shift; $self->{add} = Thread::Queue->new("#RESET"); $self->{todo} = Thread::Queue->new(); $self->{waiting} = 0; $self->{end} = 0; push @{ $self->{threads} }, threads->create(\&run_add, $self)->tid(); for (my $i = 0; $i < $main::threads; $i++) { push @{ $self->{threads} }, threads->create(\&run, $self, $i + 1)->tid(); } } # Enqueue a RESET sub reset($) { my $self = shift; $self->{add}->enqueue("#RESET"); } # Clean queues and wait for stop threads sub end($) { my $self = shift; $self->{end} = 1; while($self->{add}->dequeue_nb()) { } while($self->{todo}->dequeue_nb()) { } $self->reset(); { lock($self); cond_broadcast($self); } return $self->join(); } sub stop($) { my $self = shift; $self->{stop} = 1; return $self->end(); } # Enqueue URLs sub add_url($@) { my $self = shift; while (my $link = shift) { $self->{add}->enqueue($link); } } # Function executed to prepare URL to fetch sub run_add { my $self = shift; my $id = shift // 0; my %urlseen; while (! $self->{stop}) { my $link = $self->{add}->dequeue_nb(); if (! defined $link) { { lock($self); $self->{waiting} += 1; cond_broadcast($self); } $link = $self->{add}->dequeue(); last if (! defined $link); { lock($self); $self->{waiting} -= 1; } } next if (exists $urlseen{$link}); if ($link eq "#RESET") { %urlseen = (); } else { $self->{todo}->enqueue($link); $urlseen{$link} = 1; } } } # Main function to fetch pages sub run { my $self = shift; my $id = shift // 0; while (! $self->{stop}) { my $url = $self->{todo}->dequeue_nb(); if (! defined $url) { { lock($self); $self->{waiting} += 1; cond_broadcast($self); } $url = $self->{todo}->dequeue(); last if (! defined $url); { lock($self); $self->{waiting} -= 1; } } say "[$id] $baseurl$url"; my $p = FicPage->new($baseurl . $url); $p->fetch(); $p->toRightURL() if ($url !~ /\/$/); my @links = $p->getNearLinks() if ! $self->{end}; $p->treatLinks(); $p->save(); for my $link (@links) { $self->add_url($link) if ($link ne $url && ($link =~ /^?\Q$url\E/) || $p->{url} =~ /\.css$/); } } } # Join sub join($) { my $self = shift; sleep 1 if ($self->{waiting} == $main::threads + 1); return ($self->{waiting} > 0 && ! $self->{end}) if ($self->{waiting} == $main::threads + 1); { lock($self); cond_wait($self) until ($self->{waiting} >= $main::threads + 1 || $self->{waiting} < 0 || $self->{stop}); } return ($self->{waiting} > 0 && ! $self->{end}); } package FicPage; # Represent a web page in order to perform some treatment on it use v5.10.1; use strict; use warnings; use File::Basename; use File::Path qw(make_path); use HTTP::Request::Common qw(GET POST); use LWP::UserAgent; # Initialize the class sub new { my $class = shift; my $self = { url => shift, }; bless $self, $class; return $self; } # Fetch the page content sub fetch($) { my $self = shift; my $ua = LWP::UserAgent->new; my $res = $ua->request(GET $self->{url}); $self->{content} = $res->content; } # If the URL store is a short one, try to expand it by looking to correct URL sub toRightURL($) { my $self = shift; my $search = $self->{url}; $search =~ s!/$!!; for my $url ($self->getLinks()) { $url = $baseurl . $url; if ($url =~ /^\Q$search\E/) { $url =~ s!^(\Q$search\E[^/]*).*$!$1/!; $self->{url} = $url; return $url; } } return $self->{url}; } # Retrieve all links in the page content sub getLinks($) { my $self = shift; return $self->{content} =~ /(?:src|href|action)="([^"]+)"/g; } # Search relative (and absolute linking to the site part) links sub getNearLinks($) { my $self = shift; if ($self->{url} =~ /\.css$/) { return $self->{content} =~ /url\(["']?(?:\Q$main::baseurl\E|(?:\/?\.\.)+)?([^:)"'?#]+)/g; } else { my @links = $self->{content} =~ /(?:src|href)="(?:\Q$main::baseurl\E|(?:\/?\.\.)+)?([^:"]+)"/g; for my $action ($self->{content} =~ /action="(?:\Q$main::baseurl\E|(?:\/?\.\.)+)?([^:"]+)"/g) { push @links, $action; push @links, "$action/gerr"; push @links, "$action/serr"; } return @links; } } # Remove SALT base from URL contained in the page sub treatLinks($) { my $self = shift; $self->{content} =~ s!(src|href|action)="( \Q$baseteams\E[^/]+/ | \Q$baseadmin\E | \Q$basehome\E)([^"]*)"!$1="/$3"!gx; } # Generate the path where saved the page content sub getSavePath($;$) { my $self = shift; my $basedir = shift // $main::tmpdir; # Convert URL to real directory path my $path = $self->{url}; $path =~ s/^\Q$main::baseurl\E//; return "$basedir$path"; } # Is the page already saved? sub alreadySaved($;$) { my $self = shift; my $path = $self->getSavePath(@_); return -f $path || ( -d $path && -f "$path/index.html" ); } # Really save the page content sub save($;$) { my $self = shift; # Convert URL to real directory path my $path = $self->getSavePath(@_); eval { if ($path =~ /\.[a-z0-9]{2,4}$/) { eval { make_path( dirname("$path") , { mode => 0751 }) if (! -d dirname("$path") ); }; open my $fd, ">", $path or die "$path: $!"; print $fd $self->{content}; close $fd; } else { eval { make_path "$path", { mode => 0751 } if (! -d "$path"); }; open my $fd, ">", "$path/index.html" or die "$path: $!"; print $fd $self->{content}; close $fd; } }; print $@ if ($@); } __END__ =head1 NAME FIC parallel WGET =head1 USAGE ./gen_site.pl [OPTIONS] [COMMANDS] =head1 DESCRIPTION TODO =head1 OPTIONS =over =item B<-baseadmin=string> Called SALT_ADMIN in PHP part. Default: C =item B<-baseerrors=string> There is no equivalent in PHP part, it's harcoded. Default: C =item B<-basehome=string> Called SALT_PUBLIC in PHP part. Default: C =item B<-baseteams=string> Called SALT_USER in PHP part. Default: C =item B<-deamon> Run forever (until JOIN scheduler instruction occurs). =item B<-help> Displays the help. =item B<-outdir=path> Path where save generated pages =item B<-socket=path> Path to the socket to create. =item B<-tempdir=path> Path to use as basename for temporary directories. =back =item B<-threads=int> Number of parallel fetch to perform. =back =head1 SCHEDULER COMMANDS =over =item B Freeze the scheduler until unfreeze is sent. =item B Generate pages for all teams. =item B Generate pages for the public part. =item B Generate errors pages. =item B Generate all pages for the team C<00>. =item B Generate pages for the C<11> theme for the team C<00>. =item B Clean the temporary directory. =item B Clean the temporary directory, only if the queue has less than 5 items. =item B Made a full synchronization of the output directory (remove existing output dir and replace it by the current temporary content and reset temporary content). =item B Made a incremental synchronization with the output directory (just copy all files in the current temporary content to the output dir, don't remove anything). =item B Perform a C in the temporary directory content. =item B Flush the scheduler queue and wait for all jobs done. =item B Regenerate the nginx file containing teams IDs. =item B Display some help. =back =head1 DEPENDENCIES =over =item perl >= 5.10.1, compilated with threads support =item HTTP::Request::Common =back =head1 AUTHOR Pierre-Olivier Mercier =head1 LICENSE AND COPYRIGHT Copyright (c) 2014 Pierre-Olivier Mercier