I wrote this perl script that analizes a text and plot out a bar graph of the word occurrences during the text.
Since come mainly from Javascript and PHP, i would like to know how bad is this code and if there is room for improvement and how, any tip will be appreciated!
Thanks!
#!usr/bin/perl -w
use strict;
use warnings;
use GD::Graph::bars;
use GD::Graph::colour;
if (!$ARGV[0]) {
die "Error:\nPlease supply a txt file and a database of words to exclude\n";
}
my $file = $ARGV[0];
my $dbfile = $ARGV[1];
my $cnt = 0;
my %wordlist = ();
my @db = ();
my $threshold = 18;
binmode STDOUT,"utf8";
open (HNDL,'<:utf8', $file) || die "wrong filename";
if ($dbfile){
open (HDB,'<:utf8', $dbfile) || die "wrong filename";
@db = <HDB>; # load file into array
foreach my $db (@db) {
# remove newlines
#$db =~ s/\n\s*//;
chomp($db);
}
}
while (my $val = <HNDL>)
{
my @val = split / /, $val; # split each space build words array
foreach my $word (@val){
# clean the word remove newlines
#$word =~ s/\n\s*//;
chomp($word);
# clean the word remove everything until '
$word =~ s/^.\'//;
# clean the word remove any nonword characters
# $word =~ s/[^a-zA-Z_0-9]//;
# clean the word remove dots and commas
$word =~ s/,|\.//;
if (length($word) < 2){
next;
};
# check if the word is to be excluded
my $exclude = 0;
if ($dbfile){
foreach my $dbword (@db) {
# check if the string contains dbword
# ^ check for occurence at start
# $ check for occurence at end
# /i case insensitive
if ($word =~ m/^$dbword$/i){
$exclude = 1;
#print "Escluso: $word";
}
}
}
if($exclude == 0) {
# check word in the hash
if (exists $wordlist{$word}) {
#if present augment value by one
$wordlist{$word}++;
}else{
#if not present add
$wordlist{$word} = 1;
}
}else{
# do nothing
}
}
}
# Export CSV
# Print header
print ("Word,Value\n");
# print hash elements for each sorted element
for my $key ( sort { $wordlist{$a} <=> $wordlist{$b} } keys %wordlist){
#print "$key ---> $wordlist{$key}\n\n";
if ($wordlist{$key} >= $threshold){
print "$key,$wordlist{$key}\n";
}
}
# Export Graph
my $graph = GD::Graph::bars->new(1024, 600);
# Build data arrays
my @xdata = ();
my @ydata = ();
my @data = ();
for my $key ( sort { $wordlist{$a} <=> $wordlist{$b} } keys %wordlist){
#print "$key ---> $wordlist{$key}\n\n";
if ($wordlist{$key} >= $threshold){
push(@xdata,$key);
push(@ydata,$wordlist{$key});
}
}
push (@data,\@xdata);
push (@data,\@ydata);
$graph->set(
x_label => 'Words',
x_label_position => '0.5',
y_label => 'Number of Hits',
title => 'Number of Hits for Each Word',
bar_spacing => 5,
bar_width => 25,
transparent => '0',
bgclr => 'white'
) or warn $graph->error;
my $image = $graph->plot(\@data) or die $graph->error;
# write file to output
$file =~ s/\.[*]$//;
open(PNGOUT,">","$file.png");
binmode PNGOUT;
print PNGOUT $image->png;
close(PNGOUT);
close (HDB);
close (HNDL);
1 Answer 1
Use lexical filehandles instead of typeglobs (read here for more about why; the biggest problem with typeglobs is that they are global in scope).
open (my $handle,'<:utf8', $file) || die "wrong filename";
You should close your filehandles sooner rather than waiting til the end of the program, if possible. For example, there's no reason to keep
HDB
open after its contents have been read into memory.Consider changing
@db
to a hash, then usingexists
to check whether a word is in the exclude list. This should be faster than looping through the array (depending on how many words are in the exclude list and how often a word will be in the list). Since case is not important, you could calllc
oruc
before testing.If you want to keep it as an array, then consider exiting the loop using
last
once the first match is found.You sort the keys of
%wordlist
twice using the same criteria. Consider storing the sorted list in a new variable.