# Read every line of input (the files named on the command line, or standard
# input if none are given), assuming that it is a catapult log file.

# Main input loop.  Every log line is tallied by protocol and, for HTTP, by
# the kind of object fetched.  For each category the loop maintains four
# package globals that the reporting code reads later: a request count
# (e.g. $http), a byte total ($httpbytes), a size histogram keyed by size
# (%httparray) and the largest size seen ($httpmax).

while (<>) {

    # Progress indicator: print a dot for every 1000 lines of input.
    # Perl leaves STDERR unbuffered, so the dot appears immediately and no
    # explicit flush is required.

    $tmptotal++;
    if ($tmptotal >= 1000) {
        print STDERR ".";
        $tmptotal = 0;
    }

    # Parse a line of the log file into variables.  Fields are separated by
    # a comma followed by a space; only $size and $protocol are used below.

    ($machine, $user, $date, $time, $x, $y, $host, $ms, $size, $sent,
         $protocol, $ret, $verb, $object, $inet) = split( /, /, $_ );

    # Act based on the URL entry type--HTTP (3), FTP (1), or Gopher (2).
    # Lines with any other protocol value are ignored.

    if ( $protocol == 3 ) {

        # Ignore all entries with size of zero; these are typically errors.

        if ($size == 0) {

            $zeros += 1;

        } else {

            $httpbytes += $size;
            $http += 1;
            $httparray{$size} += 1;
            $httpmax = $size if ($size > $httpmax);

            # If this is an HTML, GIF or JPG HTTP URL, count it specially.
            # Note that we just use the file extension to grok this
            # information, which may be inaccurate.  The patterns are
            # matched against the whole log line ($_); the dot is escaped
            # so that only a real ".ext" immediately before a field
            # separator counts as an extension.

            if (/\.html?, /i || m|/[^. ]*, INet,|) {

                # Either an explicit .htm/.html extension, or a final path
                # component with no extension at all (counted as HTML).

                $htm += 1;
                $htmbytes += $size;
                $htmarray{$size} += 1;
                $htmmax = $size if ($size > $htmmax);
            } elsif (/\.gif, /i) {
                $gif += 1;
                $gifbytes += $size;
                $gifarray{$size} += 1;
                $gifmax = $size if ($size > $gifmax);
            } elsif (/\.jpe?g, /i) {
                $jpg += 1;
                $jpgbytes += $size;
                $jpgarray{$size} += 1;
                $jpgmax = $size if ($size > $jpgmax);
            } else {
                $other += 1;
                $otherbytes += $size;
                $otherarray{$size} += 1;
                $othermax = $size if ($size > $othermax);
            }
        }

    } elsif ( $protocol == 1 ) {

        # FTP entries are counted regardless of size.

        $ftpbytes += $size;
        $ftp++;
        $ftparray{$size} += 1;
        $ftpmax = $size if ($size > $ftpmax);

    } elsif ( $protocol == 2 ) {

        # Gopher entries are counted regardless of size.

        $gopherbytes += $size;
        $gopher++;
        $gopherarray{$size} += 1;
        $gophermax = $size if ($size > $gophermax);
    }
}

# Print the per-protocol summary table: one row for each protocol, with the
# HTTP sub-categories indented beneath the HTTP row.

$totalurls = $ftp + $gopher + $http;
print "\n";

# Select the row and page-header formats used by write() in display_stats.

$~ = "PROTO";
$^ = "PROTO_TOP";

# Each entry is: label, request count, byte total, size histogram (passed
# as a typeglob) and largest size seen.

foreach my $row (
    [ "HTTP",         $http,   $httpbytes,   *httparray,   $httpmax   ],
    [ "  HTTP HTML",  $htm,    $htmbytes,    *htmarray,    $htmmax    ],
    [ "  HTTP GIF",   $gif,    $gifbytes,    *gifarray,    $gifmax    ],
    [ "  HTTP JPEG",  $jpg,    $jpgbytes,    *jpgarray,    $jpgmax    ],
    [ "  HTTP Other", $other,  $otherbytes,  *otherarray,  $othermax  ],
    [ "FTP",          $ftp,    $ftpbytes,    *ftparray,    $ftpmax    ],
    [ "Gopher",       $gopher, $gopherbytes, *gopherarray, $gophermax ],
) {
    display_stats(@$row);
}

# Now display the distribution of HTTP URL sizes.  Sizes are grouped into
# power-of-two buckets: the first row counts sizes below 64, the next rows
# count sizes below 128, 256, ... 65536, and a final catch-all row (shown
# with the sentinel size 99999999999) counts everything from 65536 upward.
# Rows are printed only until every HTTP entry has been accounted for.

print "\nHTTP URL Size distribution:\n";

$~ = "DIST";
$^ = "DIST_TOP";
$- = 0;                 # zero lines left on the page: forces the DIST_TOP
                        # header to be printed before the first row
$power = 32;
$httpmedian = 0;        # retained from the original; never computed here
$quitearly = 0;
$current = 0;

# $i (entries accounted for so far) and $powertotal (entries in the current
# bucket) are package globals; reset them explicitly so that values left
# over from earlier code cannot skew the distribution.

$i = 0;
$powertotal = 0;

if ($http > 0) {

    while ($i < $http) {
        $i += $httparray{$current};
        $powertotal += $httparray{$current};
        $current += 1;

        # Reached the upper edge of the current bucket: print it and start
        # the next one, which is twice as wide.

        if ($current == $power*2) {
            $power *= 2;
            $powerpct = (int(10000*($powertotal/$http)))/100;
            write;
            $powertotal = 0;
        }

        # Past the last real bucket: lump everything that remains into one
        # catch-all row and stop.  The entries of size exactly 65536 were
        # already added to $powertotal above, so add the remainder to it
        # rather than overwriting it.

        if ($current > 65536) {
            $powertotal += $http - $i;
            $powerpct = (int(10000*($powertotal/$http)))/100;
            $power = 99999999999;
            write;
            $i = $http;
            $quitearly = 1;
        }
    }

    # The loop ended part-way through a bucket; print that partial bucket.

    if ($quitearly == 0) {
        $power *= 2;
        $powerpct = (int(10000*($powertotal/$http)))/100;
        write;
        $powertotal = 0;
    }

} else {

    # With no HTTP entries there is nothing to divide by; say so instead of
    # dying with a division-by-zero error.

    print "    (no HTTP URLs with non-zero size)\n";
}

# display_stats(NAME, COUNT, BYTES, *HISTOGRAM, MAX)
#
# Print one row of the protocol summary using the currently selected
# format ($~).
#
#   NAME        label shown in the first column
#   COUNT       number of URLs in this category
#   BYTES       total bytes for this category
#   *HISTOGRAM  typeglob of a hash mapping size => number of URLs of that size
#   MAX         largest size seen in this category
#
# Sets the package globals $pct (share of $totalurls, in percent), $avg
# (integer mean size) and $median before calling write.  The median is the
# smallest size at which the cumulative count reaches half of COUNT.  All
# three are zero when COUNT is zero.

sub display_stats {

    # These must be dynamically scoped (local, not my): the format invoked
    # by write reads $protoname, $protocount and $max as package variables,
    # and *array aliases the caller's histogram hash as %array.

    local($protoname, $protocount, $protobytes, *array, $max) = @_;

    $pct = 0;
    $avg = 0;
    $median = 0;

    if ($protocount > 0) {
        $pct = ($totalurls > 0) ? ($protocount/$totalurls)*100 : 0;
        $avg = int($protobytes/$protocount);

        # Walk the sizes that actually occur, in ascending numeric order,
        # until half of the entries have been seen.  Iterating over the
        # hash keys (rather than counting up from zero) always terminates
        # and costs time proportional to the number of distinct sizes.

        my $seen = 0;
        foreach my $size (sort { $a <=> $b } keys %array) {
            $seen += $array{$size};
            if ($seen >= $protocount/2) {
                $median = $size + 0;
                last;
            }
        }
    }

    write;
}

#########################################################################

# Page header for the per-protocol summary table.  The main script assigns
# this format's name to $^ before the summary rows are written.

format PROTO_TOP =
Protocol         # URLs     % of total     Average     Median    Max Size
-----------------------------------------------------------------------------
.

# One row of the per-protocol summary table.  The main script assigns this
# format's name to $~.  The fields are filled from the package variables
# $protoname (left-justified text), $protocount, $pct (two decimal places),
# $avg, $median and $max (right-justified numbers).

format PROTO =
@<<<<<<<<<<<  @########     @##.##       @#######   @#######   @#########
$protoname, $protocount, $pct, $avg, $median, $max
.

# Page header for the HTTP size-distribution table.  The main script assigns
# this format's name to $^ before the distribution rows are written.

format DIST_TOP =
    Up to size       Count        Percent of total
.

# One row of the HTTP size-distribution table.  The main script assigns this
# format's name to $~.  The fields are filled from the package variables
# $power, $powertotal and $powerpct (two decimal places, followed by a
# literal percent sign).

format DIST =
    @###########    @#######       @##.##%
    $power, $powertotal, $powerpct
.