Leaked source code of windows server 2003
# $Id: RobotRules.pm,v 1.21 2000/04/07 20:17:54 gisle Exp $

package WWW::RobotRules;

=head1 NAME

WWW::RobotRules - Parse robots.txt files

=head1 SYNOPSIS

 require WWW::RobotRules;
 my $robotsrules = new WWW::RobotRules 'MOMspider/1.0';

 use LWP::Simple qw(get);

 $url = "http://some.place/robots.txt";
 my $robots_txt = get $url;
 $robotsrules->parse($url, $robots_txt);

 $url = "http://some.other.place/robots.txt";
 $robots_txt = get $url;
 $robotsrules->parse($url, $robots_txt);

 # Now we are able to check if a URL is valid for those servers that
 # we have obtained and parsed "robots.txt" files for.
 if ($robotsrules->allowed($url)) {
     $c = get $url;
     ...
 }

=head1 DESCRIPTION

This module parses a F</robots.txt> file as specified in
"A Standard for Robot Exclusion", described in
<http://info.webcrawler.com/mak/projects/robots/norobots.html>.
Webmasters can use the F</robots.txt> file to disallow conforming
robots access to parts of their web site.

The parsed file is kept in the WWW::RobotRules object, and this object
provides methods to check if access to a given URL is prohibited.  The
same WWW::RobotRules object can parse multiple F</robots.txt> files.

The following methods are provided:

=over 4

=cut

$VERSION = sprintf("%d.%02d", q$Revision: 1.21 $ =~ /(\d+)\.(\d+)/);
sub Version { $VERSION; }

use strict;
use URI ();

=item $rules = WWW::RobotRules->new($robot_name)

This is the constructor for WWW::RobotRules objects.  The first
argument given to new() is the name of the robot.
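
For example, a robot that identifies itself as C<MOMspider/1.0> might set
itself up like this (illustrative only):

 my $rules = WWW::RobotRules->new('MOMspider/1.0');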

=cut

sub new {
    my ($class, $ua) = @_;

    # This ugly hack is needed to ensure backwards compatibility.
    # The "WWW::RobotRules" class is now really abstract.
    $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";

    my $self = bless { }, $class;
    $self->agent($ua);
    $self;
}

=item $rules->parse($robot_txt_url, $content, $fresh_until)

The parse() method takes as arguments the URL that was used to
retrieve the F</robots.txt> file, and the contents of the file.  The
optional $fresh_until argument is the time (in seconds since the epoch)
until which the parsed rules should be considered fresh; if it is
omitted, the rules are kept for one year.
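
For instance, a robot could fetch a F</robots.txt> file with LWP::Simple
and ask for the rules to be kept for one day (a sketch; the host name is
made up):

 use LWP::Simple qw(get);

 my $rules = WWW::RobotRules->new('MOMspider/1.0');
 my $url = "http://example.com/robots.txt";
 my $robots_txt = get($url);
 $rules->parse($url, $robots_txt, time + 24*3600) if defined $robots_txt;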

=cut

sub parse {
    my ($self, $robot_txt_uri, $txt, $fresh_until) = @_;
    $robot_txt_uri = URI->new("$robot_txt_uri");
    my $netloc = $robot_txt_uri->authority;

    $self->clear_rules($netloc);
    $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));

    my $ua;
    my $is_me = 0;              # 1 iff this record is for me
    my $is_anon = 0;            # 1 iff this record is for *
    my @me_disallowed = ();     # rules disallowed for me
    my @anon_disallowed = ();   # rules disallowed for *

    # blank lines are significant, so turn CRLF into LF to avoid generating
    # false ones
    $txt =~ s/\015\012/\012/g;

    # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
    for (split(/[\012\015]/, $txt)) {

        # Lines containing only a comment are discarded completely, and
        # therefore do not indicate a record boundary.
        next if /^\s*\#/;

        s/\s*\#.*//;        # remove comments at end-of-line

        if (/^\s*$/) {      # blank line
            last if $is_me; # That was our record. No need to read the rest.
            $is_anon = 0;
        }
        elsif (/^User-Agent:\s*(.*)/i) {
            $ua = $1;
            $ua =~ s/\s+$//;
            if ($is_me) {
                # This record already had a User-agent that
                # we matched, so just continue.
            }
            elsif ($ua eq '*') {
                $is_anon = 1;
            }
            elsif ($self->is_me($ua)) {
                $is_me = 1;
            }
        }
        elsif (/^Disallow:\s*(.*)/i) {
            unless (defined $ua) {
                warn "RobotRules: Disallow without preceding User-agent\n";
                $is_anon = 1;   # assume that User-agent: * was intended
            }
            my $disallow = $1;
            $disallow =~ s/\s+$//;
            if (length $disallow) {
                my $ignore;
                eval {
                    my $u = URI->new_abs($disallow, $robot_txt_uri);
                    $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
                    $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
                    $ignore++ if $u->port ne $robot_txt_uri->port;
                    $disallow = $u->path_query;
                    $disallow = "/" unless length $disallow;
                };
                next if $@;
                next if $ignore;
            }

            if ($is_me) {
                push(@me_disallowed, $disallow);
            }
            elsif ($is_anon) {
                push(@anon_disallowed, $disallow);
            }
        }
        else {
            warn "RobotRules: Unexpected line: $_\n";
        }
    }

    if ($is_me) {
        $self->push_rules($netloc, @me_disallowed);
    }
    else {
        $self->push_rules($netloc, @anon_disallowed);
    }
}

# is_me()
#
# Returns TRUE if the given name matches the
# name of this robot
#
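# For example (illustrative), with the agent name set to "MOMspider/1.0"
# (stored internally as "MOMspider" once agent() strips the version),
# is_me("momspider") and is_me("Mozilla-compatible MOMspider") are true,
# while is_me("OtherBot") is false: the test is a case-insensitive
# substring match against the User-Agent value from the record.
#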
sub is_me {
    my ($self, $ua) = @_;
    my $me = $self->agent;
    return index(lc($ua), lc($me)) >= 0;
}

=item $rules->allowed($uri)

Returns TRUE if this robot is allowed to retrieve this URL.  If no
fresh rules are cached for the URL's host, allowed() returns -1, which
signals that the host's F</robots.txt> file should be fetched and
parsed first.
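
A minimal sketch, assuming $rules has already parsed the relevant
F</robots.txt> file (the URL is made up):

 my $uri = "http://example.com/cyberworld/map/index.html";
 if ($rules->allowed($uri)) {
     # ... fetch the page ...
 }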

=cut

sub allowed {
    my ($self, $uri) = @_;
    $uri = URI->new("$uri");

    my $netloc = $uri->authority;

    my $fresh_until = $self->fresh_until($netloc);
    return -1 if !defined($fresh_until) || $fresh_until < time;

    my $str = $uri->path_query;
    my $rule;
    for $rule ($self->rules($netloc)) {
        return 1 unless length $rule;
        return 0 if index($str, $rule) == 0;
    }
    return 1;
}

# The following methods must be provided by the subclass.
sub agent;
sub visit;
sub no_visits;
sub last_visit;
sub fresh_until;
sub push_rules;
sub clear_rules;
sub rules;
sub dump;

package WWW::RobotRules::InCore;

use vars qw(@ISA);
@ISA = qw(WWW::RobotRules);

=item $rules->agent([$name])

Get/set the agent name.  NOTE: Changing the agent name will clear the
robots.txt rules and expire times out of the cache.
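
A small illustration (the names are arbitrary):

 my $rules = WWW::RobotRules->new('MOMspider/1.0');
 print $rules->agent, "\n";     # "MOMspider" - the version is stripped
 $rules->agent('OtherBot/2.0'); # cached rules and expiry times are dropped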

=cut

sub agent {
    my ($self, $name) = @_;
    my $old = $self->{'ua'};
    if ($name) {
        delete $self->{'loc'};          # all old info is now stale
        $name =~ s!/?\s*\d+.\d+\s*$!!;  # lose the version number
        $self->{'ua'} = $name;
    }
    $old;
}

sub visit {
    my ($self, $netloc, $time) = @_;
    $time ||= time;
    $self->{'loc'}{$netloc}{'last'} = $time;

    my $count = \$self->{'loc'}{$netloc}{'count'};
    if (!defined $$count) {
        $$count = 1;
    }
    else {
        $$count++;
    }
}

sub no_visits {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'count'};
}

sub last_visit {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'last'};
}

sub fresh_until {
    my ($self, $netloc, $fresh_until) = @_;
    my $old = $self->{'loc'}{$netloc}{'fresh'};
    if (defined $fresh_until) {
        $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
    }
    $old;
}

sub push_rules {
    my ($self, $netloc, @rules) = @_;
    push(@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
}

sub clear_rules {
    my ($self, $netloc) = @_;
    delete $self->{'loc'}{$netloc}{'rules'};
}

sub rules {
    my ($self, $netloc) = @_;
    if (defined $self->{'loc'}{$netloc}{'rules'}) {
        return @{$self->{'loc'}{$netloc}{'rules'}};
    }
    else {
        return ();
    }
}

sub dump {
    my $self = shift;
    for (keys %$self) {
        next if $_ eq 'loc';
        print "$_ = $self->{$_}\n";
    }
    for (keys %{$self->{'loc'}}) {
        my @rules = $self->rules($_);
        print "$_: ", join("; ", @rules), "\n";
    }
}

1;

__END__

=back

=head1 ROBOTS.TXT

The format and semantics of the "/robots.txt" file are as follows
(this is an edited abstract of
<http://info.webcrawler.com/mak/projects/robots/norobots.html>):

The file consists of one or more records separated by one or more
blank lines.  Each record contains lines of the form

  <field-name>: <value>

The field name is case insensitive.  Text after the '#' character on a
line is ignored during parsing.  This is used for comments.  The
following <field-names> can be used:

=over 3

=item User-Agent

The value of this field is the name of the robot the record is
describing access policy for.  If more than one I<User-Agent> field is
present, the record describes an identical access policy for more than
one robot.  At least one field needs to be present per record.  If the
value is '*', the record describes the default access policy for any
robot that has not matched any of the other records.

=item Disallow

The value of this field specifies a partial URL that is not to be
visited.  This can be a full path, or a partial path; any URL that
starts with this value will not be retrieved.

=back

=head1 ROBOTS.TXT EXAMPLES

The following example "/robots.txt" file specifies that no robots
should visit any URL starting with "/cyberworld/map/" or "/tmp/":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space
  Disallow: /tmp/ # these will soon disappear

This example "/robots.txt" file specifies that no robots should visit
any URL starting with "/cyberworld/map/", except the robot called
"cybermapper":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space

  # Cybermapper knows where to go.
  User-agent: cybermapper
  Disallow:

This example indicates that no robots should visit this site further:

  # go away
  User-agent: *
  Disallow: /
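
To tie this back to the module, a robot could parse the first example
above and check a few URLs against it (a sketch; the host name is made
up):

  my $rules = WWW::RobotRules->new('MOMspider/1.0');
  my $txt = join "\n",
      "User-agent: *",
      "Disallow: /cyberworld/map/",
      "Disallow: /tmp/",
      "";
  $rules->parse("http://example.com/robots.txt", $txt);

  print $rules->allowed("http://example.com/index.html"), "\n";        # 1
  print $rules->allowed("http://example.com/tmp/scratch.html"), "\n";  # 0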

=head1 SEE ALSO

L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>

=cut