#!/usr/bin/perl # Extract sheets from xls files as tab seperated text files. # Currently some utf-8 characters are translated as well as a few html entities. # Based on a dump example that came with Spreadsheet::ParseExcel. use utf8; use locale ':not_characters'; use open ':locale'; use POSIX qw(locale_h); setlocale(LC_ALL, "en_US.utf8"); use Spreadsheet::ParseExcel; foreach $file (@ARGV) { print STDERR "Reading $file.\n"; $ex = new Spreadsheet::ParseExcel; $book = $ex->Parse($file); if (!$book) { print STDERR "Unable to parse $file.\n"; next; } foreach $sheet (@{$book->{Worksheet}}) { $name = $sheet->{Name}; $name = lc($name); $name =~ s/[^abcdefghijklmnopqrstuvwxyz[:digit:]]//g; $bname = $name; $count = 0; while (-e "new/$name.tsv") { $count++; $name = "UNIQUE$count-$bname"; } open(FILE, ">new/$name.tsv") || die "Unable to open $name.\n"; print STDERR "Writing $bname to new/$name.tsv.\n"; print FILE "\n" x $sheet->{MinRow}; for($row = $sheet->{MinRow} ; defined $sheet->{MaxRow} && $row <= $sheet->{MaxRow} ; $row++) { $new = 1; print FILE "\t" x $sheet->{MinCol}; for($col = $sheet->{MinCol} ; defined $sheet->{MaxCol} && $col <= $sheet->{MaxCol} ; $col++) { $cell = $sheet->{Cells}[$row][$col]; if ($new) { $new = 0; } else { print FILE "\t"; } if ($cell) { $val = $cell->value; $val =~ s/’|‘/'/g; $val =~ s/<//g; $val =~ s/&/\&/g; utf8::decode($val); print FILE $val; } } print FILE "\n"; } close(FILE); } }