Bots section
MIRBOT 1.0
by
The Mystical Friend
September 1999
courtesy of fravia's searchlores.org
I'm happy to present you another kind of perl-bot, that should be a valuable addition
to our bot-section. I thank Bernd and/or The Mystical Friend
for this excellent contribution.
Please send all feedback and amelioration proposals to both The Mystical Friend (mystend(at)yahoo(point)com) and myself,
I'll publish if worth..
#!/usr/local/bin/perl -I. ## -I. stands for Include . (you understand later why)
## This is a very special kind of a Robot. You can start
## ---------------------------------------------------------------
## M I R B O T 1. 0
## ---------------------------------------------------------------
## this one from a free cgi-bin service like virtualave or netfirms remotly
## with a browser. You need only a start_cgi script, which opens a new shell
## e.g. #!/usr/bin/perl -I.
## system(./bot.cgi?xyz); #xyz stands for the parameters
## So and here is how it works:
## You give the bot an URL (e.g. http://129.105.116.5/fravia/index.htm),
## Username and Password for a FTP-Account, and an E-mail Adress for statisical
## reasons. First the bot downloads the index.htm, then he parses the doc. and
## after that, he loads the docoment up to the Ftp Server. This can be
## used for automatical mirroring. When the bot has finished, he sends you an
## E-mail (over sendmail). This Mail tells you about the errors, how many and
## which kind of files has been uploaded.
## Installation: Upload all standard perl modules into the cgi-bin Directory,
## then upload the bot, and finally upload a little File and rename it to "live"
## (read the source and the comments why).
##
## Hmm the bot worked fine after programming; He is tested
## well, and no bugs has been found.
## Tips for newbies: Learn Perl, and you can write bots like this one.
##
## The Mystical Friend [email protected]
###########################################################################
#Include of packages
@INC = ($INC[0], $INC[1], $INC[2]);
# use CGI::Carp qw(fatalsToBrowser); # uncomment this, if the bot causes an Internal Server Error 500.
use strict;
use vars qw($FtpServer $Username $Password $Site2mirror $Email);
use LWP::RobotUA;
use URI::URL;
use HTML::TreeBuilder;
use Net::FTP;
use File::Basename;
print "Content-type: text/html\n\n
";
###########################################################################
#Start of main routine
package main;
$| = 1;
print "\n\nScript $0 successfully initialized.\n\n";
print "Content-type: text/plain\nPragma: no-cache\n";
print '_' x 100;
print "@INC \n"; #only for debugging reasons, just to show you the Include pathes. If you are using the bot on a free service, you have to upload the whole modules-directroy to get it work
#($FtpServer, $Username, $Password, $Site2mirror, $Email) = GetFormData(); # you can use this, if you want to start the bot with a HTML
##page (POST!). This works with URL-Coding.
## e.g. bot.cgi?FtpServer .....
($FtpServer, $Username, $Password, $Site2mirror, $Email) = ("ftp.server", "username", "password", "site2mirror", "email\@adress.com");
my $domain = URI::URL -> new ($Site2mirror) -> host();
###########################################################################
#Initialization of FTP-connection
#print "\n\nInitializing FTP-connection to $FtpServer ..... ";
my $FtpAccess = Net::FTP -> new($FtpServer, Timeout => 200) || print
"ERROR while connecting to $FtpServer, $!";
$FtpAccess->login($Username, $Password) || print "LOGIN ERROR";
$FtpAccess->cwd("public_html") || print "Directory ERROR";
my @filelist = $FtpAccess -> dir();
print "DONE \n"; t
###########################################################################
#Initialization of User Agent as RobotUA
print "Initializing robot ...... ";
my $MirBo = new LWP::RobotUA ("Mirror-Bot\/1\.0", "MirBo\@hotmail\.com"); # Change Mirror-Bot\/1/.0 to something like Mozilla
$MirBo -> delay(0,15);
print "DONE. \n";
###########################################################################
#Initialization of global variables
#my %SEEN = (); # For Hash mode. But I think an Array is better.
my @URLs = ($Site2mirror);
my @SEEN = ($URLs[0]);
# $SEEN{$URLs[0]} = $URLs[0];
my ($TotalRequests, $HTMLDocs, $Binaries, $Uploads, $TotalBytes, $Errors);
print "\nStart main recursive routine ..... \n\n";
recursiv ();
print "\nMain routine finished.\n\n";
###########################################################################
#Some statistics
$Errors = $TotalRequests - ($HTMLDocs + $Binaries);
Sendmail ("$TotalRequests requests done \(Binaries: $Binaries, Docs: $HTMLDocs, Errors: $Errors\)\n\n $Uploads files retrieved and uploaded. \($TotalBytes bytes\).\n\n");
###########################################################################
#Program exit
print "Exit.... \n\n";
$FtpAccess -> quit();
exit 0;
###########################################################################
#Begin of main routine
sub recursiv {
foreach (@URLs) {
open(LIVE, "header("Content-Type"), $response->header("Content-Length"));
my $unchanged = grep {/$cl.{1,}$file/s} @filelist;
print "\t$URL is being requested..... ";
# debugging starts here
# open(ERRORF, ">>errors") || print "Cannot make a log-File";
# print ERRORF $response->code();
# close(ERRORF); #debugging ends here
if ($response->is_success() && $ct =~ "text\/html" || $ct =~ "text\/plain") {
$HTMLDocs++;
print "SUCCESSFUL. \n";
my $response = SendRequest ("GET", $URL);
my $doc = LinxParser ($response->content(), $URL);
SaveDoc ($file, $doc);
FTPUpload ("ascii", $file, $cl);
}
elsif ($response->is_success() && $ct !~ "text\/html" && $ct !~ "text\/plain") {
$Binaries++;
print "SUCCESSFUL. \n";
unless ($unchanged) {
my $response = SendRequest ("GET", $URL, $file);
FTPUpload ("binary", $file, $cl);
} else {
print "\t\t> We already have it!\n\n"; }
}
elsif ($response->is_error()) {
print "ERROR. \n";
print "\t\t> File not found!\n\n";
}
}
}
###########################################################################
#GetFilename routine
sub GetFilename {
my $URL = shift;
my $path = URI::URL -> new ($URL) -> path();
$path =~ s,/$,/index.htm,g;
$path =~ s,^/,,g;
my $file = basename($path);
return $file;
}
###########################################################################
#FTPUpload routine
sub FTPUpload {
my ($method, $file, $bytes) = @_;
print "\t\t> Uploading $file via FTP to $FtpServer ..... ";
$FtpAccess -> $method();
$FtpAccess -> put ($file, $file) || print "ERROR while trying to upload $file";
unlink $file;
print "DONE. \n\n";
$TotalBytes += $bytes;
$Uploads++;
}
###########################################################################
#SaveDoc routine
sub SaveDoc {
my ($file, $doc) = @_;
print "\t\t> Saving HTML-DOC $file ..... ";
open (FILE, ">$file");
print FILE "$doc";
close (FILE);
print "DONE. \n";
}
###########################################################################
#SendRequest routine
sub SendRequest {
$TotalRequests++;
my ($type, $URL, $file) = @_;
my $req = HTTP::Request->new($type,$URL);
my $res = $MirBo->request($req, $file);
return $res;
}
###########################################################################
#Linx-parser routine
sub LinxParser {
my ($doc, $URL) = @_;
print "\t\t> Substituting absolute links ..... ";
my $tree = HTML::TreeBuilder->new->parse($doc);
for (@{$tree->extract_links(qw/a area img/)}) {
my $l = URI::URL->new($_->[0]);
unless ($l =~ /javascript:/ || $l =~ /mailto:/) {
(my $s = $l->abs($URL)) =~ s/#.*//g;
my $host = URI::URL -> new ($s) -> host;
if ($host eq $domain) {
# my @ALREADYSEEN = $SEEN {$s}; # this is for the Hash-Mode.
my @ALREADYSEEN = grep { /$s/ } @SEEN;
push (@URLs, $s) unless @ALREADYSEEN;
push (@SEEN, $s);
# $SEEN {$s} = $s; # Hash-Mode
my $linkfile = GetFilename($l);
if ($l =~ /:\/\// || $l =~ /www./) {
$doc =~ s/\=\"$l/\=\"$linkfile/;
}
}
}
}
$tree->delete;
print "DONE. \n";
return $doc;
}
###########################################################################
#Sendmail routine
sub Sendmail {
my $BODY = shift;
open(SENDMAIL, "|/usr/sbin/sendmail -t") || print "ERROR while opening sendmail\n";
print SENDMAIL "From: Bill Gate\$ \n"; # Of course you can change this to what you want
print SENDMAIL "To: Mirbot <$Email>\n";
print SENDMAIL "Subject: Re:Status report\n\n";
print SENDMAIL "$BODY";
close(SENDMAIL);
}
###########################################################################
#Get_form_data routine
sub GetFormData {
my ($buffer, @pairs, @FORM, $name, $value) = ();
$buffer = $ENV{'QUERY_STRING'};
@pairs = split(/&/, $buffer);
foreach (@pairs) {
my ($name, $value) = split(/=/, $_);
$value =~ tr/+/ /;
$value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
$value =~ s///g;
push (@FORM, $value);
}
return @FORM;
}
#The end
###########################################################################