Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

916 lines
27 KiB

  1. @rem = '--*-Perl-*--
  2. @echo off
  3. if "%OS%" == "Windows_NT" goto WinNT
  4. perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
  5. goto endofperl
  6. :WinNT
  7. perl -x -S "%0" %*
  8. if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
  9. if %errorlevel% == 9009 echo You do not have Perl in your PATH.
  10. goto endofperl
  11. @rem ';
  12. #!perl
  13. #line 14
  14. ##
  15. ## Jeffrey Friedl ([email protected])
  16. ## Copyri.... ah hell, just take it.
  17. ##
  18. ## This is "www.pl".
  19. ## Include (require) to use, execute ("perl www.pl") to print a man page.
  20. ## Requires my 'network.pl' library.
  21. package www;
  22. $version = "951219.9";
  23. ##
  24. ## 951219.9
  25. ## -- oops, stopped sending garbage Authorization line when no
  26. ## authorization was requested.
  27. ##
  28. ## 951114.8
  29. ## -- added support for HEAD, If-Modified-Since
  30. ##
  31. ## 951017.7
  32. ## -- Change to allow a POST'ed HTTP text to have newlines in it.
  33. ## Added 'NewURL to the open_http_connection %info. Idea courtesy
  34. ## of Bryan Schmersal (http://www.transarc.com/~bryans/Home.html).
  35. ##
  36. ##
  37. ## 950921.6
  38. ## -- added more robust HTTP error reporting
  39. ## (due to [email protected])
  40. ##
  41. ## 950911.5
  42. ## -- added Authorization support
  43. ##
  44. ##
  45. ## HTTP return status codes.
  46. ##
  47. %http_return_code =
  48. (200,"OK",
  49. 201,"Created",
  50. 202,"Accepted",
  51. 203,"Partial Information",
  52. 204,"No Response",
  53. 301,"Moved",
  54. 302,"Found",
  55. 303,"Method",
  56. 304,"Not modified",
  57. 400,"Bad request",
  58. 401,"Unauthorized",
  59. 402,"Payment required",
  60. 403,"Forbidden",
  61. 404,"Not found",
  62. 500,"Internal error",
  63. 501,"Not implemented",
  64. 502,"Service temporarily overloaded",
  65. 503,"Gateway timeout");
  66. ##
  67. ## If executed directly as a program, print as a man page.
  68. ##
  69. if (length($0) >= 6 && substr($0, -6) eq 'www.pl')
  70. {
  71. seek(DATA, 0, 0) || die "$0: can't reset internal pointer.\n";
  72. print "www.pl version $version\n", '=' x 60, "\n";
  73. while (<DATA>) {
  74. next unless /^##>/../^##</; ## select lines to print
  75. s/^##[<> ]?//; ## clean up
  76. print;
  77. }
  78. exit(0);
  79. }
  80. ##
  81. ## History:
  82. ## version 950425.4
  83. ## added require for "network.pl"
  84. ##
  85. ## version 950425.3
  86. ## re-did from "Www.pl" which was a POS.
  87. ##
  88. ##
  89. ## BLURB:
  90. ## A group of routines for dealing with URLs, HTTP sessions, proxies, etc.
  91. ## Requires my 'network.pl' package. The library file can be executed
  92. ## directly to produce a man page.
  93. ##>
  94. ## A motley group of routines for dealing with URLs, HTTP sessions, proxies,
  95. ## etc. Requires my 'network.pl' package.
  96. ##
  97. ## Latest version, as well as other stuff (including network.pl) available
  98. ## at http://www.wg.omron.co.jp/~jfriedl/perl/
  99. ##
  100. ## Simpleton complete program to dump a URL given on the command-line:
  101. ##
  102. ## require 'network.pl'; ## required for www.pl
  103. ## require 'www.pl'; ## main routines
  104. ## $URL = shift; ## get URL
  105. ## ($status, $memo) = &www'open_http_url(*IN, $URL); ## connect
  106. ## die "$memo\n" if $status ne 'ok'; ## report any error
  107. ## print while <IN>; ## dump contents
  108. ##
  109. ## There are various options available for open_http_url.
  110. ## For example, adding 'quiet' to the call, i.e. vvvvvvv-----added
  111. ## ($status, $memo) = &www'open_http_url(*IN, $URL, 'quiet');
  112. ## suppresses the normal informational messages such as "waiting for data...".
  113. ##
  114. ## The options, as well as the various other public routines in the package,
  115. ## are discussed below.
  116. ##
  117. ##<
  118. ##
  119. ## Default port for the protocols whose URL we'll at least try to recognize.
  120. ##
  121. %default_port = ('http', 80,
  122. 'ftp', 21,
  123. 'gopher', 70,
  124. 'telnet', 23,
  125. 'wais', 210,
  126. );
  127. ##
  128. ## A "URL" to "ftp.blah.com" without a protocol specified is probably
  129. ## best reached via ftp. If the hostname begins with a protocol name, it's
  130. ## easy. But something like "www." maps to "http", so that mapping is below:
  131. ##
  132. %name2protocol = (
  133. 'www', 'http',
  134. 'wwwcgi','http',
  135. );
  136. $last_message_length = 0;
  137. $useragent = "www.pl/$version";
  138. ##
  139. ##>
  140. ##############################################################################
  141. ## routine: open_http_url
  142. ##
  143. ## Used as
  144. ## ($status, $memo, %info) = &www'open_http_url(*FILEHANDLE, $URL, options..)
  145. ##
  146. ## Given an unused filehandle, a URL, and a list of options, opens a socket
  147. ## to the URL and returns with the filehandle ready to read the data of the
  148. ## URL. The HTTP header, as well as other information, is returned in %info.
  149. ##
  150. ## OPTIONS are from among:
  151. ##
  152. ## "post"
  153. ## If PATH appears to be a query (i.e. has a ? in it), contact
  154. ## via a POST rather than a GET.
  155. ##
  156. ## "nofollow"
  157. ## Normally, if the initial contact indicates that the URL has moved
  158. ## to a different location, the new location is automatically contacted.
  159. ## "nofollow" inhibits this.
  160. ##
  161. ## "noproxy"
  162. ## Normally, a proxy will be used if 'http_proxy' is defined in the
  163. ## environment. This option inhibits the use of a proxy.
  164. ##
  165. ## "retry"
  166. ## If a host's address can't be found, it may well be because the
  167. ## nslookup just didn't return in time and that retrying the lookup
  168. ## after a few seconds will succeed. If this option is given, will
  169. ## wait five seconds and try again. May be given multiple times to
  170. ## retry multiple times.
  171. ##
  172. ## "quiet"
  173. ## Informational messages will be suppressed.
  174. ##
  175. ## "debug"
  176. ## Additional messages will be printed.
  177. ##
  178. ## "head"
  179. ## Requests only the file header to be sent
  180. ##
  181. ##
  182. ##
  183. ##
  184. ## The return array is ($STATUS, $MEMO, %INFO).
  185. ##
  186. ## STATUS is 'ok', 'error', 'status', or 'follow'
  187. ##
  188. ## If 'error', the MEMO will indicate why (URL was not http, can't
  189. ## connect, etc.). INFO is probably empty, but may have some data.
  190. ## See below.
  191. ##
  192. ## If 'status', the connnection was made but the reply was not a normal
  193. ## "OK" successful reply (i.e. "Not found", etc.). MEMO is a note.
  194. ## INFO is filled as noted below. Filehandle is ready to read (unless
  195. ## $info{'BODY'} is filled -- see below), but probably most useful
  196. ## to treat this as an 'error' response.
  197. ##
  198. ## If 'follow', MEMO is the new URL (for when 'nofollow' was used to
  199. ## turn off automatic following) and INFO is filled as described
  200. ## below. Unless you wish to give special treatment to these types of
  201. ## responses, you can just treat 'follow' responses like 'ok'
  202. ## responses.
  203. ##
  204. ## If 'ok', the connection went well and the filehandle is ready to
  205. ## read.
  206. ##
  207. ## INFO contains data as described at the read_http_header() function (in
  208. ## short, the HTTP response header) and additional informational fields.
  209. ## In addition, the following fields are filled in which describe the raw
  210. ## connection made or attempted:
  211. ##
  212. ## PROTOCOL, HOST, PORT, PATH
  213. ##
  214. ## Note that if a proxy is being used, these will describe the proxy.
  215. ## The field TARGET will describe the host or host:port ultimately being
  216. ## contacted. When no proxy is being used, this will be the same info as
  217. ## in the raw connection fields above. However, if a proxy is being used,
  218. ## it will refer to the final target.
  219. ##
  220. ## In some cases, the additional entry $info{'BODY'} exists as well. If
  221. ## the result-code indicates an error, the body of the message may be
  222. ## parsed for internal reasons (i.e. to support 'repeat'), and if so, it
  223. ## will be saved in $info{'BODY}.
  224. ##
  225. ## If the URL has moved, $info{'NewURL'} will exist and contain the new
  226. ## URL. This will be true even if the 'nofollow' option is specified.
  227. ##
  228. ##<
  229. ##
  230. sub open_http_url
  231. {
  232. local(*HTTP, $URL, @options) = @_;
  233. return &open_http_connection(*HTTP, $URL, undef, undef, undef, @options);
  234. }
  235. ##
  236. ##>
  237. ##############################################################################
  238. ## routine: read_http_header
  239. ##
  240. ## Given a filehandle to a just-opened HTTP socket connection (such as one
  241. ## created via &network'connect_to which has had the HTTP request sent),
  242. ## reads the HTTP header and and returns the parsed info.
  243. ##
  244. ## ($replycode, %info) = &read_http_header(*FILEHANDLE);
  245. ##
  246. ## $replycode will be the HTTP reply code as described below, or
  247. ## zero on header-read error.
  248. ##
  249. ## %info contains two types of fields:
  250. ##
  251. ## Upper-case fields are informational from the function.
  252. ## Lower-case fields are the header field/value pairs.
  253. ##
  254. ## Upper-case fields:
  255. ##
  256. ## $info{'STATUS'} will be the first line read (HTTP status line)
  257. ##
  258. ## $info{'CODE'} will be the numeric HTTP reply code from that line.
  259. ## This is also returned as $replycode.
  260. ##
  261. ## $info{'TYPE'} is the text from the status line that follows CODE.
  262. ##
  263. ## $info{'HEADER'} will be the raw text of the header (sans status line),
  264. ## newlines and all.
  265. ##
  266. ## $info{'UNKNOWN'}, if defined, will be any header lines not in the
  267. ## field/value format used to fill the lower-case fields of %info.
  268. ##
  269. ## Lower-case fields are reply-dependent, but in general are described
  270. ## in http://info.cern.ch/hypertext/WWW/Protocols/HTTP/Object_Headers.html
  271. ##
  272. ## A header line such as
  273. ## Content-type: Text/Plain
  274. ## will appear as $info{'content-type'} = 'Text/Plain';
  275. ##
  276. ## (*) Note that while the field names are are lower-cased, the field
  277. ## values are left as-is.
  278. ##
  279. ##
  280. ## When $replycode is zero, there are two possibilities:
  281. ## $info{'TYPE'} is 'empty'
  282. ## No response was received from the filehandle before it was closed.
  283. ## No other %info fields present.
  284. ## $info{'TYPE'} is 'unknown'
  285. ## First line of the response doesn't seem to be proper HTTP.
  286. ## $info{'STATUS'} holds that line. No other %info fields present.
  287. ##
  288. ## The $replycode, when not zero, is as described at
  289. ## http://info.cern.ch/hypertext/WWW/Protocols/HTTP/HTRESP.html
  290. ##
  291. ## Some of the codes:
  292. ##
  293. ## success 2xx
  294. ## ok 200
  295. ## created 201
  296. ## accepted 202
  297. ## partial information 203
  298. ## no response 204
  299. ## redirection 3xx
  300. ## moved 301
  301. ## found 302
  302. ## method 303
  303. ## not modified 304
  304. ## error 4xx, 5xx
  305. ## bad request 400
  306. ## unauthorized 401
  307. ## paymentrequired 402
  308. ## forbidden 403
  309. ## not found 404
  310. ## internal error 500
  311. ## not implemented 501
  312. ## service temporarily overloaded 502
  313. ## gateway timeout 503
  314. ##
  315. ##<
  316. ##
  317. sub read_http_header
  318. {
  319. local(*HTTP) = @_;
  320. local(%info, $_);
  321. ##
  322. ## The first line of the response will be the status (OK, error, etc.)
  323. ##
  324. unless (defined($info{'STATUS'} = <HTTP>)) {
  325. $info{'TYPE'} = "empty";
  326. return (0, %info);
  327. }
  328. chop $info{'STATUS'};
  329. ##
  330. ## Check the status line. If it doesn't match and we don't know the
  331. ## format, we'll just let it pass and hope for the best.
  332. ##
  333. unless ($info{'STATUS'} =~ m/^HTTP\S+\s+(\d\d\d)\s+(.*\S)/i) {
  334. $info{'TYPE'} = 'unknown';
  335. return (0, %info);
  336. }
  337. $info{'CODE'} = $1;
  338. $info{'TYPE'} = $2;
  339. $info{'HEADER'} = '';
  340. ## read the rest of the header.
  341. while (<HTTP>) {
  342. last if m/^\s*$/;
  343. $info{'HEADER'} .= $_; ## save whole text of header.
  344. if (m/^([^\n:]+):[ \t]*(.*\S)/) {
  345. local($field, $value) = ("\L$1", $2);
  346. if (defined $info{$field}) {
  347. $info{$field} .= "\n" . $value;
  348. } else {
  349. $info{$field} = $value;
  350. }
  351. } elsif (defined $info{'UNKNOWN'}) {
  352. $info{'UNKNOWN'} .= $_;
  353. } else {
  354. $info{'UNKNOWN'} = $_;
  355. }
  356. }
  357. return ($info{'CODE'}, %info);
  358. }
  359. ##
  360. ##>
  361. ##
  362. ##############################################################################
  363. ## routine: grok_URL(URL, noproxy, defaultprotocol)
  364. ##
  365. ## Given a URL, returns access information. Deals with
  366. ## http, wais, gopher, ftp, and telnet
  367. ## URLs.
  368. ##
  369. ## Information returned is
  370. ## (PROTOCOL, HOST, PORT, PATH, TARGET, USER, PASSWORD)
  371. ##
  372. ## If noproxy is not given (or false) and there is a proxy defined
  373. ## for the given protocol (via the "*_proxy" environmental variable),
  374. ## the returned access information will be for the proxy and will
  375. ## reference the given URL. In this case, 'TARGET' will be the
  376. ## HOST:PORT of the original URL (PORT elided if it's the default port).
  377. ##
  378. ## Access information returned:
  379. ## PROTOCOL: "http", "ftp", etc. (guaranteed to be lowercase).
  380. ## HOST: hostname or address as given.
  381. ## PORT: port to access
  382. ## PATH: path of resource on HOST:PORT.
  383. ## TARGET: (see above)
  384. ## USER and PASSWORD: for 'ftp' and 'telnet' URLs, if supplied by the
  385. ## URL these will be defined, undefined otherwise.
  386. ##
  387. ## If no protocol is defined via the URL, the defaultprotocol will be used
  388. ## if given. Otherwise, the URL's address will be checked for a leading
  389. ## protocol name (as with a leading "www.") and if found will be used.
  390. ## Otherwise, the protocol defaults to http.
  391. ##
  392. ## Fills in the appropriate default port for the protocol if need be.
  393. ##
  394. ## A proxy is defined by a per-protocol environmental variable such
  395. ## as http_proxy. For example, you might have
  396. ## setenv http_proxy http://firewall:8080/
  397. ## setenv ftp_proxy $http_proxy
  398. ## to set it up.
  399. ##
  400. ## A URL seems to be officially described at
  401. ## http://www.w3.org/hypertext/WWW/Addressing/URL/5_BNF.html
  402. ## although that document is a joke of errors.
  403. ##
  404. ##<
  405. ##
  406. sub grok_URL
  407. {
  408. local($_, $noproxy, $defaultprotocol) = @_;
  409. $noproxy = defined($noproxy) && $noproxy;
  410. ## Items to be filled in and returned.
  411. local($protocol, $address, $port, $path, $target, $user, $password);
  412. return undef unless m%^(([a-zA-Z]+)://|/*)([^/]+)(/.*)?$%;
  413. ##
  414. ## Due to a bug in some versions of perl5, $2 might not be empty
  415. ## even if $1 is. Therefore, we must check $1 for a : to see if the
  416. ## protocol stuff matched or not. If not, the protocol is undefined.
  417. ##
  418. ($protocol, $address, $path) = ((index($1,":") >= 0 ? $2 : undef), $3, $4);
  419. if (!defined $protocol)
  420. {
  421. ##
  422. ## Choose a default protocol if none given. If address begins with
  423. ## a protocol name (one that we know via %name2protocol or
  424. ## %default_port), choose it. Otherwise, choose http.
  425. ##
  426. if (defined $defaultprotocol) {
  427. $protocol = $defaultprotocol;
  428. }
  429. else
  430. {
  431. $address =~ m/^[a-zA-Z]+/;
  432. if (defined($name2protocol{"\L$&"})) {
  433. $protocol = $name2protocol{"\L$&"};
  434. } else {
  435. $protocol = defined($default_port{"\L$&"}) ? $& : 'http';
  436. }
  437. }
  438. }
  439. $protocol =~ tr/A-Z/a-z/; ## ensure lower-case.
  440. ##
  441. ## Http support here probably not kosher, but fits in nice for basic
  442. ## authorization.
  443. ##
  444. if ($protocol eq 'ftp' || $protocol eq 'telnet' || $protocol eq 'http')
  445. {
  446. ## Glean a username and password from address, if there.
  447. ## There if address starts with USER[:PASSWORD]@
  448. if ($address =~ s/^(([^\@:]+)(:([^@]+))?\@)//) {
  449. ($user, $password) = ($2, $4);
  450. }
  451. }
  452. ##
  453. ## address left is (HOSTNAME|HOSTNUM)[:PORTNUM]
  454. ##
  455. if ($address =~ s/:(\d+)$//) {
  456. $port = $1;
  457. } else {
  458. $port = $default_port{$protocol};
  459. }
  460. ## default path is '/';
  461. $path = '/' if !defined $path;
  462. ##
  463. ## If there's a proxy and we're to proxy this request, do so.
  464. ##
  465. local($proxy) = $ENV{$protocol."_proxy"};
  466. if (!$noproxy && defined($proxy) && !&no_proxy($protocol,$address))
  467. {
  468. local($dummy);
  469. local($old_pass, $old_user);
  470. ##
  471. ## Since we're going through a proxy, we want to send the
  472. ## proxy the entire URL that we want. However, when we're
  473. ## doing Authenticated HTTP, we need to take out the user:password
  474. ## that webget has encoded in the URL (this is a bit sleazy on
  475. ## the part of webget, but the alternative is to have flags, and
  476. ## having them part of the URL like with FTP, etc., seems a bit
  477. ## cleaner to me in the context of how webget is used).
  478. ##
  479. ## So, if we're doing this slezy thing, we need to construct
  480. ## the new URL from the compnents we have now (leaving out password
  481. ## and user), decode the proxy URL, then return the info for
  482. ## that host, a "filename" of the entire URL we really want, and
  483. ## the user/password from the original URL.
  484. ##
  485. ## For all other things, we can just take the original URL,
  486. ## ensure it has a protocol on it, and pass it as the "filename"
  487. ## we want to the proxy host. The difference between reconstructing
  488. ## the URL (as for HTTP Authentication) and just ensuring the
  489. ## protocol is there is, except for the user/password stuff,
  490. ## nothing. In theory, at least.
  491. ##
  492. if ($protocol eq 'http' && (defined($password) || defined($user)))
  493. {
  494. $path = "http://$address$path";
  495. $old_pass = $password;
  496. $old_user = $user;
  497. } else {
  498. ## Re-get original URL and ensure protocol// actually there.
  499. ## This will become our new path.
  500. ($path = $_) =~ s,^($protocol:)?/*,$protocol://,i;
  501. }
  502. ## note what the target will be
  503. $target = ($port==$default_port{$protocol})?$address:"$address:$port";
  504. ## get proxy info, discarding
  505. ($protocol, $address, $port, $dummy, $dummy, $user, $password)
  506. = &grok_URL($proxy, 1);
  507. $password = $old_pass if defined $old_pass;
  508. $user = $old_user if defined $old_user;
  509. }
  510. ($protocol, $address, $port, $path, $target, $user, $password);
  511. }
  512. ##
  513. ## &no_proxy($protocol, $host)
  514. ##
  515. ## Returns true if the specified host is identified in the no_proxy
  516. ## environmental variable, or identify the proxy server itself.
  517. ##
  518. sub no_proxy
  519. {
  520. local($protocol, $targethost) = @_;
  521. local(@dests, $dest, $host, @hosts, $aliases);
  522. local($proxy) = $ENV{$protocol."_proxy"};
  523. return 0 if !defined $proxy;
  524. $targethost =~ tr/A-Z/a-z/; ## ensure all lowercase;
  525. @dests = ($proxy);
  526. push(@dests,split(/\s*,\s*/,$ENV{'no_proxy'})) if defined $ENV{'no_proxy'};
  527. foreach $dest (@dests)
  528. {
  529. ## just get the hostname
  530. $host = (&grok_URL($dest, 1), 'http')[1];
  531. if (!defined $host) {
  532. warn "can't grok [$dest] from no_proxy env.var.\n";
  533. next;
  534. }
  535. @hosts = ($host); ## throw in original name just to make sure
  536. ($host, $aliases) = (gethostbyname($host))[0, 1];
  537. if (defined $aliases) {
  538. push(@hosts, ($host, split(/\s+/, $aliases)));
  539. } else {
  540. push(@hosts, $host);
  541. }
  542. foreach $host (@hosts) {
  543. next if !defined $host;
  544. return 1 if "\L$host" eq $targethost;
  545. }
  546. }
  547. return 0;
  548. }
  549. sub ensure_proper_network_library
  550. {
  551. require 'network.pl' if !defined $network'version;
  552. warn "WARNING:\n". __FILE__ .
  553. qq/ needs a newer version of "network.pl"\n/ if
  554. !defined($network'version) || $network'version < "950311.5";
  555. }
  556. ##
  557. ##>
  558. ##############################################################################
  559. ## open_http_connection(*FILEHANDLE, HOST, PORT, PATH, TARGET, OPTIONS...)
  560. ##
  561. ## Opens an HTTP connection to HOST:PORT and requests PATH.
  562. ## TARGET is used only for informational messages to the user.
  563. ##
  564. ## If PORT and PATH are undefined, HOST is taken as an http URL and TARGET
  565. ## is filled in as needed.
  566. ##
  567. ## Otherwise, it's the same as open_http_url (including return value, etc.).
  568. ##<
  569. ##
  570. sub open_http_connection
  571. {
  572. local(*HTTP, $host, $port, $path, $target, @options) = @_;
  573. local($post_text, @error, %seen);
  574. local(%info);
  575. &ensure_proper_network_library;
  576. ## options allowed:
  577. local($post, $retry, $authorization, $nofollow, $noproxy,
  578. $head, $debug, $ifmodifiedsince, $quiet, ) = (0) x 10;
  579. ## parse options:
  580. foreach $opt (@options)
  581. {
  582. next unless defined($opt) && $opt ne '';
  583. local($var, $val);
  584. if ($opt =~ m/^(\w+)=(.*)/) {
  585. ($var, $val) = ($1, $2);
  586. } else {
  587. $var = $opt;
  588. $val = 1;
  589. }
  590. $var =~ tr/A-Z/a-z/; ## ensure variable is lowercase.
  591. local(@error);
  592. eval "if (defined \$$var) { \$$var = \$val; } else { \@error =
  593. ('error', 'bad open_http_connection option [$opt]'); }";
  594. return ('error', "open_http_connection eval: $@") if $@;
  595. return @error if defined @error;
  596. }
  597. $quiet = 0 if $debug; ## debug overrides quiet
  598. local($protocol, $error, $code, $URL, %info, $tmp, $aite);
  599. ##
  600. ## if both PORT and PATH are undefined, treat HOST as a URL.
  601. ##
  602. unless (defined($port) && defined($path))
  603. {
  604. ($protocol,$host,$port,$path,$target)=&grok_URL($host,$noproxy,'http');
  605. if ($protocol ne "http") {
  606. return ('error',"open_http_connection doesn't grok [$protocol]");
  607. }
  608. unless (defined($host)) {
  609. return ('error', "can't grok [$URL]");
  610. }
  611. }
  612. return ('error', "no port in URL [$URL]") unless defined $port;
  613. return ('error', "no path in URL [$URL]") unless defined $path;
  614. RETRY: while(1)
  615. {
  616. ## we'll want $URL around for error messages and such.
  617. if ($port == $default_port{'http'}) {
  618. $URL = "http://$host";
  619. } else {
  620. $URL = "http://$host:$default_port{'http'}";
  621. }
  622. $URL .= ord($path) eq ord('/') ? $path : "/$path";
  623. $aite = defined($target) ? "$target via $host" : $host;
  624. &message($debug, "connecting to $aite ...") unless $quiet;
  625. ##
  626. ## note some info that might be of use to the caller.
  627. ##
  628. local(%preinfo) = (
  629. 'PROTOCOL', 'http',
  630. 'HOST', $host,
  631. 'PORT', $port,
  632. 'PATH', $path,
  633. );
  634. if (defined $target) {
  635. $preinfo{'TARGET'} = $target;
  636. } elsif ($default_port{'http'} == $port) {
  637. $preinfo{'TARGET'} = $host;
  638. } else {
  639. $preinfo{'TARGET'} = "$host:$port";
  640. }
  641. ## connect to the site
  642. $error = &network'connect_to(*HTTP, $host, $port);
  643. if (defined $error) {
  644. return('error', "can't connect to $aite: $error", %preinfo);
  645. }
  646. ## If we're asked to POST and it looks like a POST, note post text.
  647. if ($post && $path =~ m/\?/) {
  648. $post_text = $'; ## everything after the '?'
  649. $path = $`; ## everything before the '?'
  650. }
  651. ## send the POST or GET request
  652. $tmp = $head ? 'HEAD' : (defined $post_text ? 'POST' : 'GET');
  653. &message($debug, "sending request to $aite ...") if !$quiet;
  654. print HTTP $tmp, " $path HTTP/1.0\n";
  655. ## send the If-Modified-Since field if needed.
  656. if ($ifmodifiedsince) {
  657. print HTTP "If-Modified-Since: $ifmodifiedsince\n";
  658. }
  659. ## oh, let's sputter a few platitudes.....
  660. print HTTP "Accept: */*\n";
  661. print HTTP "User-Agent: $useragent\n" if defined $useragent;
  662. ## If doing Authorization, do so now.
  663. if ($authorization) {
  664. print HTTP "Authorization: Basic ",
  665. &htuu_encode($authorization), "\n";
  666. }
  667. ## If it's a post, send it.
  668. if (defined $post_text)
  669. {
  670. print HTTP "Content-type: application/x-www-form-urlencoded\n";
  671. print HTTP "Content-length: ", length $post_text, "\n\n";
  672. print HTTP $post_text, "\n";
  673. }
  674. print HTTP "\n";
  675. &message($debug, "waiting for data from $aite ...") unless $quiet;
  676. ## we can now read the response (header, then body) via HTTP.
  677. binmode(HTTP); ## just in case.
  678. ($code, %info) = &read_http_header(*HTTP);
  679. &message(1, "header returns code $code ($info{'TYPE'})") if $debug;
  680. ## fill in info from %preinfo
  681. local($val, $key);
  682. while (($val, $key) = each %preinfo) {
  683. $info{$val} = $key;
  684. }
  685. if ($code == 0)
  686. {
  687. return('error',"empty response for $URL")
  688. if $info{'TYPE'} eq 'empty';
  689. return('error', "non-HTTP response for $URL", %info)
  690. if $info{'TYPE'} eq 'unknown';
  691. return('error', "unknown zero-code for $URL", %info);
  692. }
  693. if ($code == 302) ## 302 is magic for "Found"
  694. {
  695. if (!defined $info{'location'}) {
  696. return('error', "No location info for Found URL $URL", %info);
  697. }
  698. local($newURL) = $info{'location'};
  699. ## Remove :80 from hostname, if there. Looks ugly.
  700. $newURL =~ s,^(http:/+[^/:]+):80/,$1/,i;
  701. $info{"NewURL"} = $newURL;
  702. ## if we're not following links or if it's not to HTTP, return.
  703. return('follow', $newURL, %info) if
  704. $nofollow || $newURL!~m/^http:/i;
  705. ## note that we've seen this current URL.
  706. $seen{$host, $port, $path} = 1;
  707. &message(1, qq/[note: now moved to "$newURL"]/) unless $quiet;
  708. ## get the new one and return an error if it's been seen.
  709. ($protocol, $host, $port, $path, $target) =
  710. &www'grok_URL($newURL, $noproxy);
  711. &message(1, "[$protocol][$host][$port][$path]") if $debug;
  712. if (defined $seen{$host, $port, $path})
  713. {
  714. return('error', "circular reference among:\n ".
  715. join("\n ", sort grep(/^http/i, keys %seen)), %seen);
  716. }
  717. next RETRY;
  718. }
  719. elsif ($code == 500) ## 500 is magic for "internal error"
  720. {
  721. ##
  722. ## A proxy will often return this with text saying "can't find
  723. ## host" when in reality it's just because the nslookup returned
  724. ## null at the time. Such a thing should be retied again after a
  725. ## few seconds.
  726. ##
  727. if ($retry)
  728. {
  729. local($_) = $info{'BODY'} = join('', <HTTP>);
  730. if (/Can't locate remote host:\s*(\S+)/i) {
  731. local($times) = ($retry == 1) ?
  732. "once more" : "up to $retry more times";
  733. &message(0, "can't locate $1, will try $times ...")
  734. unless $quiet;
  735. sleep(5);
  736. $retry--;
  737. next RETRY;
  738. }
  739. }
  740. }
  741. if ($code != 200) ## 200 is magic for "OK";
  742. {
  743. ## I'll deal with these as I see them.....
  744. &clear_message;
  745. if ($info{'TYPE'} eq '')
  746. {
  747. if (defined $http_return_code{$code}) {
  748. $info{'TYPE'} = $http_return_code{$code};
  749. } else {
  750. $info{'TYPE'} = "(unknown status code $code)";
  751. }
  752. }
  753. return ('status', $info{'TYPE'}, %info);
  754. }
  755. &clear_message;
  756. return ('ok', 'ok', %info);
  757. }
  758. }
  759. ##
  760. ## Hyper Text UUencode. Somewhat different from regular uuencode.
  761. ##
  762. ## Logic taken from Mosaic for X code by Mark Riordan and Ari Luotonen.
  763. ##
  764. sub htuu_encode
  765. {
  766. local(@in) = unpack("C*", $_[0]);
  767. local(@out);
  768. push(@in, 0, 0); ## in case we need to round off an odd byte or two
  769. while (@in >= 3) {
  770. ##
  771. ## From the next three input bytes,
  772. ## construct four encoded output bytes.
  773. ##
  774. push(@out, $in[0] >> 2);
  775. push(@out, (($in[0] << 4) & 060) | (($in[1] >> 4) & 017));
  776. push(@out, (($in[1] << 2) & 074) | (($in[2] >> 6) & 003));
  777. push(@out, $in[2] & 077);
  778. splice(@in, 0, 3); ## remove these three
  779. }
  780. ##
  781. ## @out elements are now indices to the string below. Convert to
  782. ## the appropriate actual text.
  783. ##
  784. foreach $new (@out) {
  785. $new = substr(
  786. "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/",
  787. $new, 1);
  788. }
  789. if (@in == 2) {
  790. ## the two left over are the two extra nulls, so we encoded the proper
  791. ## amount as-is.
  792. } elsif (@in == 1) {
  793. ## We encoded one extra null too many. Undo it.
  794. $out[$#out] = '=';
  795. } else {
  796. ## We must have encoded two nulls... Undo both.
  797. $out[$#out ] = '=';
  798. $out[$#out -1] = '=';
  799. }
  800. join('', @out);
  801. }
  802. ##
  803. ## This message stuff really shouldn't be here, but in some seperate library.
  804. ## Sorry.
  805. ##
  806. ## Called as &message(SAVE, TEXT ....), it shoves the text to the screen.
  807. ## If SAVE is true, bumps the text out as a printed line. Otherwise,
  808. ## will shove out without a newline so that the next message overwrites it,
  809. ## or it is clearded via &clear_message().
  810. ##
  811. sub message
  812. {
  813. local($nl) = shift;
  814. die "oops $nl." unless $nl =~ m/^\d+$/;
  815. local($text) = join('', @_);
  816. local($NL) = $nl ? "\n" : "\r";
  817. $thislength = length($text);
  818. if ($thislength >= $last_message_length) {
  819. print STDERR $text, $NL;
  820. } else {
  821. print STDERR $text, ' 'x ($last_message_length-$thislength), $NL;
  822. }
  823. $last_message_length = $nl ? 0 : $thislength;
  824. }
  825. sub clear_message
  826. {
  827. if ($last_message_length) {
  828. print STDERR ' ' x $last_message_length, "\r";
  829. $last_message_length = 0;
  830. }
  831. }
  832. 1;
  833. __END__
  834. __END__
  835. :endofperl