windows-server-2003/tools/parsetable.pm

#---------------------------------------------------------------------
package ParseTable;
#
#   Copyright (c) Microsoft Corporation. All rights reserved.
#
# Version: 1.00 (07/12/2000) : (JeremyD) initial version
#          1.01 (08/25/2000) : (JeremyD) allow single heading tables
#---------------------------------------------------------------------
use strict;
use vars qw(@ISA @EXPORT $VERSION);
use IO::File;
use Carp;
use Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(parse_table_lines parse_table_file);

$VERSION = '1.01';


# parse_table_lines(@lines, $storage)
#
# Parse the first table found in @lines and store its rows in $storage.
#
# Because of the (\@;$) prototype the array is passed by reference and is
# modified in place: every line that has been examined is removed from it.
# Parsing stops at the end of the array or at the heading line of the next
# table; that heading line is put back (without its trailing newline) so that
# a following call starts with the next table.
#
#   @lines   - the text lines to parse
#   $storage - optional ARRAY or HASH reference that receives one hash per
#              data line (heading name => field value):
#                ARRAY ref: each row hash is pushed onto the array
#                HASH ref : each row hash is stored under the value of its
#                           first column, later rows replacing earlier ones
#              anything else: the rows are quietly discarded, which is
#              useful for skipping over a table
#
# Returns the number of lines left in @lines; 0 means nothing remains to be
# parsed.
sub parse_table_lines (\@;$) {
    my $lines_ref = shift; # the array of lines is modified in place
    my $storage   = shift; # where the parsed rows go (see above)
    my @heading;           # the current set of headings

    # The loop tests definedness rather than truth: a line that is "0" or
    #  "" (e.g. a blank line that was already chomped) is a perfectly valid
    #  line and must not end the table early.
  LINE:
    while (defined(my $line = shift @$lines_ref)) {
        chomp $line;
        next LINE if $line =~ /^\s*$/; # skip empty lines

        if ($line =~ /^\s*[#;](.*)/) { # comments may contain headings
            my $comment = $1;

            # a heading line is a comment made up only of bracketed names
            #  separated by whitespace
            if ($comment =~ /^\s*(?:\[\w+\]\s*)+$/) {
                if (@heading) {
                    # already have headings, so this line starts the next
                    #  table: put it back and finish the current one
                    unshift @$lines_ref, $line;
                    last LINE;
                }

                # found our first set of headings
                @heading = $comment =~ /\[(\w+)\]/g;
            }
            next LINE; # done parsing this comment
        }

        next LINE unless @heading; # no data until we have our headings

        # fields are separated by 2 or more white space characters, however
        #  a single tab will also suffice
        my @data = split /(?=\t)\s+|\s{2,}/, $line;

        # require exactly 1 data field per heading, other lines are ignored
        next LINE unless @heading == @data;

        # use our current headings as keys and make a hash of the data
        my %hash;
        @hash{@heading} = @data;

        # store our current line's data in the reference passed to us
        if (ref $storage eq 'ARRAY') {
            push @$storage, \%hash;
        } elsif (ref $storage eq 'HASH') {
            $storage->{$data[0]} = \%hash;
        }
        # any other kind of storage: do nothing, this allows skipping a
        #  table by passing in a non-ref storage
    }

    # the data array was modified in place, parsed lines have been removed
    #  successive calls will parse any remaining tables found in the data array
    #  return the number of unparsed lines, 0 indicates no remaining tables
    return scalar @$lines_ref;
}

# parse_table_file($filename, @storage_refs)
#
# Read $filename and parse every table found in it. The first table is
# stored in the first storage reference, the second table in the second
# one and so on; see parse_table_lines for how each kind of storage is
# filled. Once the storage references run out the remaining tables are
# still parsed but their data is discarded.
#
# Croaks if the file cannot be opened for reading.
sub parse_table_file ($;@) {
    my ($filename, @store_refs) = @_;

    my $fh = IO::File->new($filename, "r")
        or croak "Unable to open file $filename: $!";

    # slurp the whole file, the handle is not needed while parsing
    my @lines = $fh->getlines;
    $fh->close;

    # parse_table_lines removes the lines it has looked at from @lines (its
    #  prototype passes the array by reference), so keep calling it with the
    #  next storage location until every line has been consumed
    my $i = 0;
    while (@lines) {
        parse_table_lines(@lines, $store_refs[$i++]);
    }

    return;
}


1;

__END__

=head1 NAME

ParseTable - Extract data from a formatted text table

=head1 SYNOPSIS

  use ParseTable;

  parse_table_file("foobar.txt", \%table_one, \@table_two, ...);

  $lines_remaining = parse_table_lines(@data_lines,\%table);


=head1 DESCRIPTION

This module provides an easy way to extract formatted data from text files. 

=over 4

=item parse_table_file( $filename, @storage_refs )

parse_table_file takes a filename to parse and a list of storage locations 
for the tables found within that file.

=item parse_table_lines( @data_lines, $storage_ref )

parse_table_lines takes an array of data lines and a storage location for 
the first table found in the lines. It modifies the array in place and returns 
the number of unparsed lines.

=back

The format for a table is:

 ;comments
 ; [heading1] [heading2]
 item1  item2
 item3 with internal space  item4
 item5	item6


Each line of data in a table is stored as a hash with the heading names as 
keys and the data items as values.

If an array reference is specified as the storage location the data hash for 
each line will be pushed on to the array.

If the storage location is a hash reference then the data hash for each line 
will be stored using the value of the first column as the key. In the case of 
duplicate data items the last one appearing in the table takes precedence.

=head1 EXAMPLES

 parse_table_file("codetable.txt",\@data);
 for $data (@data) {
     print "$data->{Lang} is the lang code for $data->{Comments}\n";
 }


 parse_table_file("codetable.txt",\%data,\%flavors);
 print "your site is $data{$user_lang}{Site}\n";
 print "your flavor is $flavors{$user_lang}{$user_arch}\n";


 codetable.txt:
 ;
 ;     This is just an example of a file with two tables
 ;


 ;[Lang] [LCID] [Class] [Site]  [Comments] 
 ;-------------------------------------------------------------
 ;       
 ARA  0x0401    @CS    REDMOND  Arabic
 CHS  0x0804    @FE    REDMOND  Chinese Simplified (PR China)
 CHT  0x0404    @FE    REDMOND  Chinese Traditional (Taiwan Region)
 CHH  0x0404    @FE    REDMOND  Chinese Traditional (Hong Kong Region)
 FR   0x040C    @EU    DUBLIN   French
 GER  0x0407    @EU    REDMOND  German
 ;[Lang]        [x86]                 [ia64]
 ;=============================================
 USA            per;pro;srv;ads;dtc   pro;ads;dtc
 GER            per;pro;srv;ads       pro;ads
 CHT            per;pro;srv;ads       pro;ads
 CHH            per;pro;srv;ads       pro;ads
 CHS            per;pro;srv;ads       pro;ads
 ARA            per;pro               pro

=head1 NOTES

The parser can handle blank lines and comments beginning with either ';' or 
'#'.

A heading line must appear before any data lines. A heading line is a special 
form of comment consisting of field names enclosed in brackets [].

Data lines must have exactly as many fields as there are headings in the heading line.

Data fields must be separated by 2 or more whitespace characters or by a single 
tab. Single spaces within data items do not require quoting or escaping.

Quoting and escaping are not supported in any way. This means you may not 
have a data field with the value "" (empty string) or more than 1 space in a row.

Storage locations are not cleared before parsing begins.

Heading names must match the regex /\w+/.

Should probably be expanded to handle returning a plain array for single column 
tables (lists of filenames, etc).

=head1 SEE ALSO

  hashtext.pm

=head1 AUTHOR

Jeremy Devenport <JeremyD>

=head1 COPYRIGHT

Copyright (c) Microsoft Corporation. All rights reserved.

=cut