# $Id: RobotRules.pm,v 1.21 2000/04/07 20:17:54 gisle Exp $

package WWW::RobotRules;

=head1 NAME

WWW::RobotRules - Parse robots.txt files

=head1 SYNOPSIS

    require WWW::RobotRules;
    my $robotsrules = WWW::RobotRules->new('MOMspider/1.0');

    use LWP::Simple qw(get);

    my $url = "http://some.place/robots.txt";
    my $robots_txt = get $url;
    $robotsrules->parse($url, $robots_txt);

    $url = "http://some.other.place/robots.txt";
    $robots_txt = get $url;
    $robotsrules->parse($url, $robots_txt);

    # Now we are able to check if a URL is valid for those servers
    # whose "robots.txt" files we have obtained and parsed.
    if ($robotsrules->allowed($url)) {
        my $content = get $url;
        ...
    }

=head1 DESCRIPTION

This module parses F</robots.txt> files as specified in
"A Standard for Robot Exclusion", described at
<http://info.webcrawler.com/mak/projects/robots/norobots.html>.
Webmasters can use the F</robots.txt> file to forbid conforming
robots access to parts of their web site.

The parsed file is kept in the WWW::RobotRules object, and this object
provides methods to check if access to a given URL is prohibited.  The
same WWW::RobotRules object can parse multiple F</robots.txt> files.

The following methods are provided:

=over 4

=cut

$VERSION = sprintf("%d.%02d", q$Revision: 1.21 $ =~ /(\d+)\.(\d+)/);
sub Version { $VERSION; }

use strict;
use URI ();


=item $rules = WWW::RobotRules->new($robot_name)

This is the constructor for WWW::RobotRules objects.  The first
argument given to new() is the name of the robot.

=cut

sub new {
    my($class, $ua) = @_;

    # This ugly hack is needed to ensure backwards compatibility.
    # The "WWW::RobotRules" class is now really abstract.
    $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";

    my $self = bless { }, $class;
    $self->agent($ua);
    $self;
}

=item $rules->parse($robot_txt_url, $content, $fresh_until)

The parse() method takes as arguments the URL that was used to
retrieve the F</robots.txt> file, and the contents of the file.  The
optional third argument gives the time (seconds since the epoch)
until which the parsed rules should be considered fresh; if it is
omitted, the rules are cached for one year.
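
For example, to cache the rules for one hour instead of the default
one year (a sketch; the URL and lifetime are illustrative, and get()
is from LWP::Simple):

    my $robots_url = "http://some.place/robots.txt";
    $rules->parse($robots_url, get($robots_url), time + 3600);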

=cut

sub parse {
    my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
    $robot_txt_uri = URI->new("$robot_txt_uri");
    my $netloc = $robot_txt_uri->authority;

    $self->clear_rules($netloc);
    $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));

    my $ua;
    my $is_me = 0;            # 1 iff this record is for me
    my $is_anon = 0;          # 1 iff this record is for *
    my @me_disallowed = ();   # rules disallowed for me
    my @anon_disallowed = (); # rules disallowed for *

    # blank lines are significant, so turn CRLF into LF to avoid generating
    # false ones
    $txt =~ s/\015\012/\012/g;

    # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
    for (split(/[\012\015]/, $txt)) {

        # Lines containing only a comment are discarded completely, and
        # therefore do not indicate a record boundary.
        next if /^\s*\#/;

        s/\s*\#.*//; # remove comments at end-of-line

        if (/^\s*$/) {      # blank line
            last if $is_me; # That was our record. No need to read the rest.
            $is_anon = 0;
        }
        elsif (/^User-Agent:\s*(.*)/i) {
            $ua = $1;
            $ua =~ s/\s+$//;
            if ($is_me) {
                # This record already had a User-agent that
                # we matched, so just continue.
            }
            elsif ($ua eq '*') {
                $is_anon = 1;
            }
            elsif ($self->is_me($ua)) {
                $is_me = 1;
            }
        }
        elsif (/^Disallow:\s*(.*)/i) {
            unless (defined $ua) {
                warn "RobotRules: Disallow without preceding User-agent\n";
                $is_anon = 1; # assume that User-agent: * was intended
            }
            my $disallow = $1;
            $disallow =~ s/\s+$//;
            if (length $disallow) {
                my $ignore;
                eval {
                    # Ignore rules that point at a different scheme,
                    # host or port than the robots.txt URL itself.
                    my $u = URI->new_abs($disallow, $robot_txt_uri);
                    $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
                    $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
                    $ignore++ if $u->port ne $robot_txt_uri->port;
                    $disallow = $u->path_query;
                    $disallow = "/" unless length $disallow;
                };
                next if $@;
                next if $ignore;
            }

            if ($is_me) {
                push(@me_disallowed, $disallow);
            }
            elsif ($is_anon) {
                push(@anon_disallowed, $disallow);
            }
        }
        else {
            warn "RobotRules: Unexpected line: $_\n";
        }
    }

    if ($is_me) {
        $self->push_rules($netloc, @me_disallowed);
    }
    else {
        $self->push_rules($netloc, @anon_disallowed);
    }
}

# is_me()
#
# Returns TRUE if the given name matches the name of this robot
# (case-insensitive substring match)
#
sub is_me {
    my($self, $ua) = @_;
    my $me = $self->agent;
    return index(lc($ua), lc($me)) >= 0;
}

=item $rules->allowed($uri)

Returns TRUE if this robot is allowed to retrieve this URL.  Returns
-1 if the rules cached for the URL's server are missing or no longer
fresh.
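
For example, a sketch that re-fetches the rules when they have gone
stale (the URLs are illustrative, and get() is from LWP::Simple):

    my $url = "http://some.place/doc.html";
    my $ok = $rules->allowed($url);
    if ($ok < 0) {
        # Rules for this server are missing or expired; fetch them again.
        my $robots_url = "http://some.place/robots.txt";
        $rules->parse($robots_url, get($robots_url));
        $ok = $rules->allowed($url);
    }
    print "Fetching $url is ", ($ok ? "allowed" : "forbidden"), "\n";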

=cut

sub allowed {
    my($self, $uri) = @_;
    $uri = URI->new("$uri");
    my $netloc = $uri->authority;

    my $fresh_until = $self->fresh_until($netloc);
    return -1 if !defined($fresh_until) || $fresh_until < time;

    my $str = $uri->path_query;
    my $rule;
    for $rule ($self->rules($netloc)) {
        return 1 unless length $rule;
        return 0 if index($str, $rule) == 0;
    }
    return 1;
}

# The following methods must be provided by the subclass.
sub agent;
sub visit;
sub no_visits;
sub last_visit;
sub fresh_until;
sub push_rules;
sub clear_rules;
sub rules;
sub dump;


package WWW::RobotRules::InCore;

use vars qw(@ISA);
@ISA = qw(WWW::RobotRules);

=item $rules->agent([$name])

Get/set the agent name.  NOTE: Changing the agent name will clear the
robots.txt rules and expire times out of the cache.
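
For example (the names are illustrative):

    my $rules = WWW::RobotRules->new('MOMspider/1.0');
    print $rules->agent, "\n";        # prints "MOMspider"; the version is stripped
    $rules->agent('OtherSpider/2.0'); # discards all cached rules as well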

=cut

sub agent {
    my ($self, $name) = @_;
    my $old = $self->{'ua'};
    if ($name) {
        delete $self->{'loc'}; # all old info is now stale
        $name =~ s!/?\s*\d+\.\d+\s*$!!; # lose the version number
        $self->{'ua'} = $name;
    }
    $old;
}

sub visit {
    my($self, $netloc, $time) = @_;
    $time ||= time;
    $self->{'loc'}{$netloc}{'last'} = $time;
    my $count = \$self->{'loc'}{$netloc}{'count'};
    if (!defined $$count) {
        $$count = 1;
    }
    else {
        $$count++;
    }
}

sub no_visits {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'count'};
}

sub last_visit {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'last'};
}

sub fresh_until {
    my ($self, $netloc, $fresh_until) = @_;
    my $old = $self->{'loc'}{$netloc}{'fresh'};
    if (defined $fresh_until) {
        $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
    }
    $old;
}

sub push_rules {
    my($self, $netloc, @rules) = @_;
    push(@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
}

sub clear_rules {
    my($self, $netloc) = @_;
    delete $self->{'loc'}{$netloc}{'rules'};
}

sub rules {
    my($self, $netloc) = @_;
    if (defined $self->{'loc'}{$netloc}{'rules'}) {
        return @{$self->{'loc'}{$netloc}{'rules'}};
    }
    else {
        return ();
    }
}

sub dump {
    my $self = shift;
    for (keys %$self) {
        next if $_ eq 'loc';
        print "$_ = $self->{$_}\n";
    }
    for (keys %{$self->{'loc'}}) {
        my @rules = $self->rules($_);
        print "$_: ", join("; ", @rules), "\n";
    }
}

1;

__END__

=back

=head1 ROBOTS.TXT

The format and semantics of the "/robots.txt" file are as follows
(this is an edited abstract of
<http://info.webcrawler.com/mak/projects/robots/norobots.html>):

The file consists of one or more records separated by one or more
blank lines.  Each record contains lines of the form

    <field-name>: <value>

The field name is case insensitive.  Text after the '#' character on a
line is ignored during parsing.  This is used for comments.  The
following <field-names> can be used:

=over 3

=item User-Agent

The value of this field is the name of the robot the record is
describing access policy for.  If more than one I<User-Agent> field is
present the record describes an identical access policy for more than
one robot.  At least one field needs to be present per record.  If the
value is '*', the record describes the default access policy for any
robot that has not matched any of the other records.

=item Disallow

The value of this field specifies a partial URL that is not to be
visited.  This can be a full path, or a partial path; any URL that
starts with this value will not be retrieved.

=back

=head1 ROBOTS.TXT EXAMPLES

The following example "/robots.txt" file specifies that no robots
should visit any URL starting with "/cyberworld/map/" or "/tmp/":

    User-agent: *
    Disallow: /cyberworld/map/ # This is an infinite virtual URL space
    Disallow: /tmp/ # these will soon disappear

This example "/robots.txt" file specifies that no robots should visit
any URL starting with "/cyberworld/map/", except the robot called
"cybermapper":

    User-agent: *
    Disallow: /cyberworld/map/ # This is an infinite virtual URL space

    # Cybermapper knows where to go.
    User-agent: cybermapper
    Disallow:

This example indicates that no robots should visit this site further:

    # go away
    User-agent: *
    Disallow: /
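
Such a file can be fed straight to parse().  A minimal sketch (the
robot name and URLs are illustrative):

    my $rules = WWW::RobotRules->new('AnyBot/1.0');
    my $txt = "# go away\nUser-agent: *\nDisallow: /\n";
    $rules->parse("http://some.place/robots.txt", $txt);
    print $rules->allowed("http://some.place/doc.html")
        ? "allowed\n" : "forbidden\n";   # prints "forbidden"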

=head1 SEE ALSO

L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>

=cut