#--------------------------------------------------------------------- package ParseTable; # # Copyright (c) Microsoft Corporation. All rights reserved. # # Version: 1.00 (07/12/2000) : (JeremyD) inital version # 1.01 (08/25/2000) : (JeremyD) allow single heading tables #--------------------------------------------------------------------- use strict; use vars qw(@ISA @EXPORT $VERSION); use IO::File; use Carp; use Exporter; @ISA = qw(Exporter); @EXPORT = qw(parse_table_lines parse_table_file); $VERSION = '1.01'; sub parse_table_lines (\@;$) { my $lines_ref = shift; # the array of lines is modified in place my $storage = shift; # an array or hash ref to stuff the data in, if # this is not a ref we quietly discard the data # this could be useful to skip one table my @heading; # the current set of headings LINE: while (my $line = shift @$lines_ref) { chomp $line; next LINE if $line =~ /^\s*$/; # skip empty lines if ($line =~ /^\s*[#;](.*)/) { # comments may contain headings my $comment = $1; if ($comment =~ /^\s*(?:\[\w+\]\s*)+$/) { # bracketed names seperated # by whitespace if (@heading) { # already have headings, must be a new table unshift @$lines_ref, $line; # this line is part of the next # table, we need to put it back last LINE; # a new table implies the end of the current one } else { # found our first set of headings while ($comment =~ /\[(\w+)\]/g) { # look for headings push @heading, $1; } } } next LINE; # done parsing this comment } next unless @heading; # no data processing until we have our headings # fields are seperated by 2 or more white space characters, however # a single tab will also suffice my @data = split /(?=\t)\s+|\s{2,}/, $line; next unless $#heading == $#data; # require 1 data field per heading # use our current headings as keys and make a hash of the data my %hash; for (my $i=0; $i<@heading; $i++) { $hash{$heading[$i]} = $data[$i]; } # store our current line's data in the reference passed to us if (ref $storage eq 'ARRAY') { push @$storage, \%hash; } elsif (ref $storage eq 'HASH') { $storage->{$data[0]} = \%hash; } else { # do nothing # this allows skipping a table by passing in a non-ref storage } } # the data array was modified in place, parsed lines have been removed # successive calls will parse any remaining tables found in the data array # return the number of unparsed lines, 0 indicates no remaining tables return scalar @$lines_ref; } sub parse_table_file ($;@) { my $filename = shift; my @store_refs = @_; my $fh = new IO::File $filename, "r"; if (defined $fh) { my @lines = $fh->getlines; my $i = 0; while (@lines) { parse_table_lines(@lines, $store_refs[$i++]); } undef $fh; } else { croak "Unable to open file $filename: $!"; } } 1; __END__ =head1 NAME ParseTable - Extract data from a formatted text table =head1 SYNOPSIS use ParseTable; parse_table_file("foobar.txt", \%table_one, \@table_two, ...); $lines_remaining = parse_table_lines(@data_lines,\%table); =head1 DESCRIPTION This module provides an easy way to extract formatted data from text files. =over 4 =item parse_table_file( $filename, @storage_refs ) parse_table_file takes a filename to parse and a list of storage locations for the tables found within that file. =item parse_table_lines( @data_lines, $storage_ref ) parse_table_lines takes an array of data lines and a storage location for the first table found in the lines. It modifies the array in place and returns the number of unparsed lines. =back The format for a table is: ;comments ; [heading1] [heading2] item1 item2 item3 with internal space item4 item5 item6 Each line of data in a table is stored as a hash with the heading names as keys and the data items as values. If an array reference is specified as the storage location the data hash for each line will be pushed on to the array. If the storage location is a hash reference then the data hash for each line will be stored using the value of the first column as the key. In the case of duplicate data items the last one appearing in the table takes precedence. =head1 EXAMPLES parse_table_file("codetable.txt",\@data) for $data (@data) { print "$data->{Lang} is the lang code for $data->{Comments}\n"; } parse_table_file("codetable.txt",\%data,\%flavors) print "your site is $data->{$user_lang}{Site}\n"; print "your flavor is $flavors->{$user_lang}{$user_arch}\n"; codetable.txt: ; ; This is just an example of a file with two tables ; ;[Lang] [LCID] [Class] [Site] [Comments] ;------------------------------------------------------------- ; ARA 0x0401 @CS REDMOND Arabic CHS 0x0804 @FE REDMOND Chinese Simplified (PR China) CHT 0x0404 @FE REDMOND Chinese Traditional (Taiwan Region) CHH 0x0404 @FE REDMOND Chinese Traditional (Hong Kong Region) FR 0x040C @EU DUBLIN French GER 0x0407 @EU REDMOND German ;[Lang] [x86] [ia64] ;============================================= USA per;pro;srv;ads;dtc pro;ads;dtc GER per;pro;srv;ads pro;ads CHT per;pro;srv;ads pro;ads CHH per;pro;srv;ads pro;ads CHS per;pro;srv;ads pro;ads ARA per;pro pro =head1 NOTES The parser can handle blank lines and comments beginning with either ';' or '#'. A heading line must appear before any data lines. A heading line is a special form of comment consisting of field names enclosed in brackets []. Data lines must have exactly as many fields as heading lines. Data fields must be seperated by 2 or more spaces. Single spaces within data items do not require quoting or escaping. Quoting and escaping are not supported in any way. This means you may not have a data field with the value "" (empty string) or more than 1 space in a row. Storage locations are not before parsing begins. Heading names must match the regex /\w+/. Should probably be expanded to handle returning a plain array for single column tables (lists of filenames, etc). =head1 SEE ALSO hashtext.pm =head1 AUTHOR Jeremy Devenport =head1 COPYRIGHT Copyright (c) Microsoft Corporation. All rights reserved. =cut