#!/usr/bin/perl
#
# $Id: superstat.pl,v 1.4 1999/11/15 13:47:00 peten Exp peten $
#
### superstat (c)1999 Pete Nelson
##  http://www.serversolved.com/superstat/
##
##  You may freely use, edit, and distribute this program free of charge,
##  under the condition that the copyrite information remains in tact.
##
##  The sale of this program is expressly forbidden.
##
###############################################################

use strict;
use warnings;

## All program state is kept in package globals because every sub shares it
## and because option values may be expanded by name inside the HTML title
## (see lookup_global).
our ($DEFAULT_CONFIG_FILE, $null_dev, $NO_STOR, $NO_SOCK, $NO_OPT);
our ($graph, $hostlookup, $use_meta_default, $silent, $infile, $outfile,
     $least_hits, $max_dirs, $max_clients, $max_referers, $hidden, $server,
     $body_def, $table_def, $table_header_def, $html_title, $img_dir,
     $config, $config_file, @hidden_dirs);
our ($common, $friday, $simple);
our ($MSG, $fh_out);
our ($hits, $totalbytes, $totalK, $beg_date, $end_date, $log_type);
our (%hph, %requests, %clients, %refers, %stats, %dates, @dates,
     %agent, %agent_level, %misc_browsers, %platform);
our (@topreqs, @topclients, @peaktimes, @browsers, @toprefer, @top_agent,
     @top_level, @top_platform, @top_dirs, %dir_cnt,
     $hitsperhr, $uniqueURL, $uniqueClients, $uniqueAgent);

## You may need to change these variables for your system:
##
## Set '$DEFAULT_CONFIG_FILE' to where ever you keep
## superstat.cfg, and have '$null_dev' point
## to '/dev/null' (Unix) or 'nul' (DOS).

$DEFAULT_CONFIG_FILE = "superstat.cfg";
$null_dev            = "/dev/null";

## Modules Used - Storable, Socket, & Getopt::Long
## You may have to comment these out - if so, just set
## $NO_STOR, $NO_SOCK, & $NO_OPT appropriately.
## (See README)
## Every call into these modules below is fully qualified, so the script
## still compiles under 'use strict' when a 'use' line is commented out.

use Storable;      # || $NO_STOR = 1;
use Socket;        # || $NO_SOCK = 1;
use Getopt::Long;  # || $NO_OPT = 1;

## That should be all you need to configure here.
## Use superstat.cfg for any other options
###############################################################

unless ($NO_OPT) {
    Getopt::Long::GetOptions(
        "graph!"             => \$graph,
        "hostlookup!"        => \$hostlookup,
        "use_meta_default!"  => \$use_meta_default,
        "silent"             => \$silent,
        "quiet"              => \$silent,
        "infile=s"           => \$infile,
        "outfile=s"          => \$outfile,
        "least_hits=i"       => \$least_hits,
        "max_dirs=i"         => \$max_dirs,
        "max_clients=i"      => \$max_clients,
        "max_referers=i"     => \$max_referers,   # documented in the POD
        "hidden_dirs=s"      => \$hidden,
        "server=s"           => \$server,
        "body_def=s"         => \$body_def,
        "table_def=s"        => \$table_def,
        "table_header_def=s" => \$table_header_def,
        "title=s"            => \$html_title,
        "img_dir=s"          => \$img_dir,
        "config=s"           => \$config,
    ) or die "superstat: invalid command-line options\n";
}

$config_file = $config || $DEFAULT_CONFIG_FILE;

if ($hidden) {
    # split ' ' drops empty fields; an empty pattern must never reach is_hidden
    @hidden_dirs = split(' ', $hidden);
}

if (-e $config_file) {
    parse_config($config_file);
}
elsif ($config) {
    warn "Config file '$config_file' not found - using defaults\n";
}

## Default Variables for generated HTML:
##
#$body_def  ||= "bgcolor=white";
#$table_def ||= "border=0 bgcolor=#99CCFF cellpadding=5 cellspacing=5";
#$img_dir   ||= "/images";   ## IMPORTANT: No trailing '/'!
$least_hits   ||= 10;
$max_dirs     ||= 50;
$max_clients  ||= 50;
$max_referers ||= 50;

# The HTML fragments are interpolated into the page; make them empty rather
# than undefined so an unset option does not raise warnings.
foreach my $fragment (\$body_def, \$table_def, \$table_header_def, \$img_dir) {
    $$fragment = '' unless defined $$fragment;
}

################# Logging Formats ################################
$common = qr/^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\d\d\d) (\d+) "(\S+)" "([^"]*)"/;
$friday = qr/^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\d\d\d) "?(\S*)"? (\d+) "?(.*)"?/;
$simple = qr/^(\S+) (\S+) (\S+) \[([^:]+):(\d+:\d+:\d+) ([^\]]+)\] "(\S+) (.*?) (\S+)" (\S+) (\S+)/;
##################################################################

unless ($infile || ($infile = $ARGV[0])) {
    exit(1) if $silent;
    $infile = prompt_for("Please enter the name of the log file:");
}

unless (-e $infile) {
    die "'$infile' does not exist!";
}

unless ($outfile || ($outfile = $ARGV[1])) {
    exit(1) if $silent;
    $outfile = prompt_for("Please enter the name of the file you would like generated:");
}

# When the report itself goes to stdout, progress chatter must not.
if ($outfile =~ /stdout/i) {
    $silent = 1;
}

if ($silent) {
    open(my $null_fh, '>', $null_dev)
        or die "Can't open null device '$null_dev': $!";
    $MSG = $null_fh;
}
else {
    $MSG = \*STDOUT;
}

if ($use_meta_default && -e ($infile . ".meta")) {
    print $MSG "Found metafile '$infile.meta'\n";
    $infile .= ".meta";
}

if ($infile =~ /\.meta$/ && !$NO_STOR) {
    import_meta();
}
else {
    parse_log();
    make_meta() unless $NO_STOR;
}

sort_data();
make_html();
exit(0);

###################################################################

## prompt_for($question)
## Prints $question and returns one chomped line read from STDIN.
## STDIN is read explicitly: '<>' would read from any file names still
## in @ARGV (e.g. the log file) instead of from the user.
## Dies when STDIN is exhausted.
sub prompt_for {
    my ($question) = @_;
    print "$question\n";
    my $answer = <STDIN>;
    die "No answer available on standard input\n" unless defined $answer;
    chomp($answer);
    return $answer;
}

## parse_line($line)
## Matches one log line against the $common, $friday and $simple formats
## (in that order).  Returns a hash ref with the keys client, usr, auth,
## date, time, tz, method, url, proto, status, bytes, referrer, user_agent
## and log_type, or nothing when the line matches no format.
## All captures are copied into a list first: a later successful match
## (such as the referrer test) would otherwise reset $1..$13.
sub parse_line {
    my ($line) = @_;
    my %rec;
    my @f;

    if (@f = $line =~ $common) {
        $rec{log_type} = "common";
        @rec{qw(bytes referrer user_agent)} = @f[10, 11, 12];
    }
    elsif (@f = $line =~ $friday) {
        $rec{log_type} = "friday";
        @rec{qw(referrer bytes user_agent)} = @f[10, 11, 12];
    }
    elsif (@f = $line =~ $simple) {
        $rec{log_type} = "simple";
        $rec{bytes}    = $f[10];
    }
    else {
        return;
    }
    @rec{qw(client usr auth date time tz method url proto status)} = @f[0 .. 9];

    # "-" (or anything non-numeric) in the bytes column counts as zero bytes.
    $rec{bytes} = 0 unless defined $rec{bytes} && $rec{bytes} =~ /^\d+$/;

    my $referrer = defined $rec{referrer} ? $rec{referrer} : '';
    $referrer =~ s/"//g;
    $referrer = "[NONE]" if $referrer eq '' || $referrer eq '-';
    $rec{referrer} = $referrer;

    if (defined $rec{user_agent} && $rec{user_agent} =~ /^\s+-\s+$/) {
        $rec{user_agent} = undef;
    }
    return \%rec;
}

## tally_user_agent($user_agent)
## Updates %agent_level, %agent, %misc_browsers and %platform for one hit.
sub tally_user_agent {
    my ($ua) = @_;

    if ($ua =~ m#^Mozilla/([2-5])#) {
        $agent_level{$1}++;
    }

    if ($ua =~ /^Mozilla/) {
        if ($ua =~ /MSIE/) { $agent{'Internet Explorer'}++; }
        else               { $agent{'Netscape'}++; }
    }
    else {
        $agent{'Other'}++;
        $misc_browsers{$ua}++;
    }

    if    ($ua =~ /Win/) { $platform{'Windows'}++; }
    elsif ($ua =~ /Mac/) { $platform{'Macintosh'}++; }
    else                 { $platform{'Unknown'}++; }
    return;
}

## parse_log
## Reads $infile and fills the statistics globals ($hits, $totalK,
## $beg_date, $end_date, %hph, %requests, %clients, %refers, %stats,
## %dates, @dates and the user-agent hashes).  Lines that match none of
## the known formats are skipped.
sub parse_log {
    $hits       = 0;
    $totalbytes = 0;
    $beg_date   = "";
    $end_date   = "";
    $log_type   = "";

    print $MSG "Loading '$infile'\n";
    open(my $log_fh, '<', $infile) or die "Can't access '$infile': $!";
    my @lines = <$log_fh>;
    close($log_fh);

    my $line_count = scalar @lines;
    print $MSG "'$infile' - $line_count lines.\nGenerating hashes\n";

    # The progress counter is redrawn in place with backspaces.
    my $line_len = length($line_count);
    my $proc_len = length(" lines processed.");
    print $MSG " " x $line_len;
    print $MSG " lines processed.";
    $| = 1;

    foreach my $line (@lines) {
        my $rec = parse_line($line) or next;
        $hits++;
        $log_type = $rec->{log_type};

        if ($hits % 1000 == 0) {
            print $MSG "\b" x ($line_len + $proc_len);
            print $MSG " " x ($line_len - length($hits));
            print $MSG "$hits lines processed.";
        }

        my $date = $rec->{date};
        $beg_date = $date if $beg_date eq "";
        $end_date = $date;

        my ($hr) = split /:/, $rec->{time};
        $hph{"$date-$hr"}++;
        $totalbytes += $rec->{bytes};

        my $url = $rec->{url};
        $url =~ s/\?.*$//;                    # ignore the query string
        my $status = $rec->{status};
        $requests{$url}++ unless $status =~ /^\d+$/ && $status > 200;

        $clients{ $rec->{client} }++;
        $refers{ $rec->{referrer} }++ unless $rec->{referrer} eq "[NONE]";
        $stats{$status}++;

        push @dates, $date unless defined $dates{$date};
        $dates{$date}++;

        tally_user_agent($rec->{user_agent}) if defined $rec->{user_agent};
    }

    print $MSG "\n'$infile' parsed.\n";
    $totalK = int($totalbytes / 1024);
    return;
}

###
#
## Sub to get hostname
##
#
###
## get_hostname($ip)
## Returns the host name for $ip, or $ip itself when the lookup fails,
## yields nothing, or takes longer than five seconds.
sub get_hostname {
    my ($ip) = @_;
    my $name;
    eval {
        local $SIG{ALRM} = sub { die "timeout\n" };
        alarm(5);
        my $packed = Socket::inet_aton($ip);
        $name = gethostbyaddr($packed, Socket::AF_INET()) if defined $packed;
        alarm(0);
    };
    return (defined $name && $name ne '') ? $name : $ip;
}

###
## Subs related to using the metafile
#
## make_meta
## Saves the parsed statistics with Storable next to the log file, or in
## the current directory when the log's directory is not writeable.
## Failure to write the metafile is reported but is not fatal.
sub make_meta {
    print $MSG "Creating meta file\n";
    my %data = (
        hits          => \$hits,
        totalK        => \$totalK,
        beg_date      => \$beg_date,
        end_date      => \$end_date,
        hph           => \%hph,
        requests      => \%requests,
        clients       => \%clients,
        refers        => \%refers,
        stats         => \%stats,
        dates         => \%dates,
        a_dates       => \@dates,
        agent         => \%agent,
        agent_level   => \%agent_level,
        misc_browsers => \%misc_browsers,
        platform      => \%platform,
    );

    my $metafile = "$infile.meta";
    if ($infile =~ m#^(.*)/([^/]+)$#) {
        my ($out_dir, $file) = ($1, $2);
        print $MSG "Checking if '$out_dir' is writeable\n";
        unless (-w $out_dir) {
            print $MSG "Can't write to '$out_dir'\n";
            $metafile = "./$file.meta";
        }
    }

    # store() returns undef on I/O errors and dies on more serious ones.
    my $stored = eval { Storable::store(\%data, $metafile) };
    if ($stored) {
        print $MSG "\nHashes created.\n";
    }
    else {
        warn "Can't create metafile $metafile" . ($@ ? ": $@" : "\n");
    }
    return;
}

## import_meta
## Restores the statistics globals from the Storable file named by $infile.
## NOTE(review): Storable's retrieve() should only be given trusted files;
## a crafted metafile can do more than supply data.
sub import_meta {
    print $MSG "Loading '$infile'\n";
    my $stored = Storable::retrieve($infile);
    die "'$infile' is not a superstat metafile\n"
        unless ref($stored) eq 'HASH';

    my $empty = '';
    $hits     = ${ $stored->{hits}     || \0 };
    $totalK   = ${ $stored->{totalK}   || \0 };
    $beg_date = ${ $stored->{beg_date} || \$empty };
    $end_date = ${ $stored->{end_date} || \$empty };

    %hph           = %{ $stored->{hph}           || {} };
    %requests      = %{ $stored->{requests}      || {} };
    %clients       = %{ $stored->{clients}       || {} };
    %refers        = %{ $stored->{refers}        || {} };
    %stats         = %{ $stored->{stats}         || {} };
    %dates         = %{ $stored->{dates}         || {} };
    @dates         = @{ $stored->{a_dates}       || [] };
    %agent         = %{ $stored->{agent}         || {} };
    %agent_level   = %{ $stored->{agent_level}   || {} };
    %misc_browsers = %{ $stored->{misc_browsers} || {} };
    %platform      = %{ $stored->{platform}      || {} };

    print $MSG "\nHashes read.\n";
    return;
}

## keys_by_count(\%count_of)
## Returns the keys of %count_of ordered from highest to lowest count.
sub keys_by_count {
    my ($count_of) = @_;
    return sort { $count_of->{$b} <=> $count_of->{$a} } keys %$count_of;
}

###
#
## Sort the data
##
#
###
## sort_data
## Builds the ranked lists (@topreqs, @topclients, @peaktimes, ...) and the
## derived totals ($hitsperhr, $uniqueURL, $uniqueClients, %dir_cnt) that
## the HTML subs print.
sub sort_data {
    print $MSG "Sorting data . . .";
    @topreqs = keys_by_count(\%requests);
    print $MSG " . . .";
    @topclients = keys_by_count(\%clients);
    print $MSG " . . .";
    @peaktimes = keys_by_count(\%hph);
    print $MSG " . . .";
    @browsers = keys_by_count(\%misc_browsers);
    @toprefer = keys_by_count(\%refers);
    print $MSG " . . .";
    @top_agent = keys_by_count(\%agent);
    print $MSG " . . .";
    @top_level = keys_by_count(\%agent_level);
    print $MSG " . . .";
    @top_platform = keys_by_count(\%platform);
    print $MSG " . . . done\n";

    print $MSG "Calculating Hits per Hour\n";
    my $active_hours = scalar keys %hph;
    my $total        = 0;
    $total += $_ foreach values %hph;
    $hitsperhr = $active_hours ? int($total / $active_hours) : 0;

    print $MSG "Calculating unique URLs\n";
    $uniqueURL = scalar @topreqs;

    print $MSG "Calculating unique clients\n";
    $uniqueClients = scalar @topclients;

    if (@top_agent) {
        # progress goes to $MSG, never straight to STDOUT (may be the report)
        print $MSG "Calculating user agents\n";
        $uniqueAgent = scalar @top_agent;
    }

    print $MSG "Calculating Directory totals\n";
    %dir_cnt = ();
    foreach my $url (@topreqs) {
        if ($url =~ /^(\/[^\/]+)\/.*$/) {
            $dir_cnt{$1} += $requests{$url};
        }
    }
    @top_dirs = keys_by_count(\%dir_cnt);
    return;
}

## escape_html($text)
## Returns $text with &, <, > and " replaced by HTML entities.  Everything
## taken from the log (URLs, referrers, user agents, host names) goes
## through this before it is written into the report.
sub escape_html {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

## lookup_global($name)
## Returns the value of the package scalar $main::<name>, or '' when unset.
sub lookup_global {
    my ($name) = @_;
    no strict 'refs';
    my $value = ${"main::$name"};
    return defined $value ? $value : '';
}

## last_shown_index($available, $limit)
## Index of the last entry to print when at most $limit of $available
## entries are shown; -1 when there is nothing to print.
sub last_shown_index {
    my ($available, $limit) = @_;
    my $last = $available - 1;
    $last = $limit - 1 if $limit < $available;
    return $last;
}

## twelve_hour($hour)
## Converts an hour 0..23 into (hour-on-a-12-hour-clock, "AM"|"PM").
sub twelve_hour {
    my ($hour) = @_;
    $hour = 0 unless defined $hour && $hour =~ /^\d+$/;
    my $meridian = ($hour < 12) ? "AM" : "PM";
    my $h12 = ($hour % 12) || 12;
    return ($h12, $meridian);
}

## NOTE(review): the exact markup of every page fragment below (tag choice,
## anchor names, where $body_def / $table_def / $table_header_def / $img_dir
## are applied, image file names) should be confirmed against the intended
## page layout.

## make_html
## Writes the complete report to $outfile (or STDOUT when the name
## contains "stdout").  "$name" tokens in the title are replaced by the
## value of the matching option or statistic.
sub make_html {
    print $MSG "Generating HTML\n";

    $server = "???" unless $server;
    if ($html_title) {
        # A plain lookup replaces the former string eval of the title.
        $html_title =~ s/\$(\w+)/lookup_global($1)/ge;
    }
    $html_title = "Web Statistics for '$server'" unless $html_title;

    my $to_stdout = ($outfile =~ /stdout/i);
    if ($to_stdout) {
        $fh_out = \*STDOUT;
    }
    else {
        open(my $html_fh, '>', $outfile)
            or die "Cannot open '$outfile' for write: $!";
        $fh_out = $html_fh;
    }

    my $count_200 = $stats{200} || 0;
    my $count_304 = $stats{304} || 0;
    my $count_404 = $stats{404} || 0;

    print {$fh_out} <<"XX";
<html>
<head><title>$html_title</title></head>
<body $body_def>
<h1>Webstats for '$server'</h1>
<h3>from $beg_date to $end_date</h3>
<table $table_def>
<tr><th $table_header_def>Total Hits</th><td>$hits</td></tr>
<tr><th $table_header_def>Average Hits per Hour</th><td>$hitsperhr</td></tr>
<tr><th $table_header_def>Total 200's</th><td>$count_200</td></tr>
<tr><th $table_header_def>Total 304's</th><td>$count_304</td></tr>
<tr><th $table_header_def>Total 404's</th><td>$count_404</td></tr>
<tr><th $table_header_def>Total Kilobytes Transferred</th><td>$totalK</td></tr>
<tr><th $table_header_def>Unique URLs</th><td>$uniqueURL</td></tr>
<tr><th $table_header_def>Unique Hosts</th><td>$uniqueClients</td></tr>
</table>
<a href="#requests">Top Requests</a> |
<a href="#dirs">Top Directories</a> |
<a href="#clients">Top Clients</a> |
XX
    if (@toprefer) {
        print {$fh_out} "<a href=\"#referers\">Top Referrers</a> |\n";
    }
    print {$fh_out} <<"XX";
<a href="#peak">Peak Hours</a>
<hr>
XX

    make_top_requests();
    make_top_dirs();
    make_top_clients();
    make_top_referers() if @toprefer;
    make_peak_hours();
    if (@top_agent) {
        make_agent_info();
        list_other_browsers();
    }
    make_hourly_graph() if $graph;

    print {$fh_out} "<br>\n<hr>\n<center>\n";
    print {$fh_out} "Statistics generated by <a href=\"http://www.serversolved.com/superstat/\">Superstat</a> &copy;1999 Pete Nelson\n";
    print {$fh_out} "</center>\n</body>\n</html>\n";

    unless ($to_stdout) {
        # buffered write errors only surface at close
        close($fh_out) or die "Error writing '$outfile': $!";
    }
    print $MSG "'$outfile' generated.\n";
    return;
}

###
#
## sub make_hourly_graph
##
#
###
## make_hourly_graph
## Prints one table per date in @dates with a bar for every hour that had
## hits.  Bars are scaled so the busiest hour is 400 pixels wide and are
## red at or below the average hits per hour, green above it.
sub make_hourly_graph {
    $img_dir =~ s|/$||;   ## remove trailing slash if there.

    print {$fh_out} <<"XX";
<a name="graph"></a>
<h2>Hourly Break-down</h2>
<h3>Average Hits per Hour - $hitsperhr</h3>
<img src="$img_dir/red.gif" width="10" height="10" alt="red"> - Less than Average Peak
<img src="$img_dir/green.gif" width="10" height="10" alt="green"> - Greater than Average Peak
XX

    # An empty log has no peak hour; avoid dividing by zero.
    my $peak  = @peaktimes ? ($hph{ $peaktimes[0] } || 0) : 0;
    my $scale = $peak ? (400 / $peak) : 0;

    foreach my $today (@dates) {
        print {$fh_out} "<table $table_def>\n<tr>\n";
        print {$fh_out} "<th $table_header_def colspan=\"3\">$today</th>\n</tr>\n";
        for my $i (0 .. 23) {
            my $slot  = sprintf("%s-%02d", $today, $i);
            my $count = $hph{$slot} || 0;
            next unless $count > 0;

            my ($h12, $meridian) = twelve_hour($i);
            my $bar_width = int($count * $scale);
            my $color     = ($count <= $hitsperhr) ? "red" : "green";

            print {$fh_out} "<tr>\n<td>$h12 $meridian</td><td>$count</td>\n";
            print {$fh_out} "<td><img src=\"$img_dir/$color.gif\" width=\"$bar_width\" height=\"10\" alt=\"\"></td>\n";
            print {$fh_out} "</tr>\n";
        }
        print {$fh_out} "</table>\n";
        print {$fh_out} "<br>\n";
    }
    return;
}

## sub make_top_requests
#
## Lists every non-hidden URL with more than $least_hits hits.
sub make_top_requests {
    print {$fh_out} <<"XX";
<a name="requests"></a>
<h2>Top Requests:</h2>
<table $table_def>
<tr><th $table_header_def>Number of Hits</th><th $table_header_def>Requested URL</th></tr>
XX

    foreach my $url (@topreqs) {
        last unless $requests{$url} > $least_hits;   # list is sorted
        next if is_hidden($url);
        my $shown = escape_html($url);
        print {$fh_out} <<"XX";
<tr><td>$requests{$url}</td><td>$shown</td></tr>
XX
    }

    print {$fh_out} <<"XX";
</table>
<br>
XX
    return;
}

## sub make_top_dirs
##
#
## Lists the (at most) $max_dirs busiest non-hidden top-level directories.
sub make_top_dirs {
    print {$fh_out} <<"XX";
<a name="dirs"></a>
<h2>Most frequently Visited Directories</h2>
<table $table_def>
<tr><th $table_header_def>Number of hits</th><th $table_header_def>Directory</th></tr>
XX

    my $last = last_shown_index(scalar @top_dirs, $max_dirs);
    for my $i (0 .. $last) {
        my $dir = $top_dirs[$i];
        next if is_hidden($dir . '/');
        my $shown = escape_html($dir);
        print {$fh_out} <<"XX";
<tr><td>$dir_cnt{$dir}</td><td>$shown/</td></tr>
XX
    }
    print {$fh_out} "</table>\n<br>\n";
    return;
}

## sub make_top_clients
##
#
## Lists the (at most) $max_clients busiest clients, resolving host names
## when --hostlookup is on and Socket is available.
sub make_top_clients {
    print {$fh_out} <<"XX";
<a name="clients"></a>
<h2>Top $max_clients Clients</h2>
<table $table_def>
<tr><th $table_header_def>Number of Hits</th><th $table_header_def>Client IP</th></tr>
XX

    my $last = last_shown_index(scalar @topclients, $max_clients);
    for my $i (0 .. $last) {
        my $client = $topclients[$i];
        my $host = ($hostlookup && !$NO_SOCK) ? get_hostname($client) : $client;
        $host = escape_html($host);
        print {$fh_out} <<"XX";
<tr><td>$clients{$client}</td><td>$host</td></tr>
XX
    }
    print {$fh_out} "</table>\n<br>\n";
    return;
}

## sub make_top_referers
##
#
## Lists the (at most) $max_referers most frequent non-hidden referrers.
sub make_top_referers {
    print {$fh_out} <<"XX";
<a name="referers"></a>
<h2>Top $max_referers Referrers</h2>
<table $table_def>
<tr><th $table_header_def>Number of Hits</th><th $table_header_def>Referring URL</th></tr>
XX

    my $last = last_shown_index(scalar @toprefer, $max_referers);
    for my $i (0 .. $last) {
        my $referrer = $toprefer[$i];
        next if is_hidden($referrer);
        my $shown = escape_html($referrer);
        print {$fh_out} <<"XX";
<tr><td>$refers{$referrer}</td><td>$shown</td></tr>
XX
    }
    print {$fh_out} "</table>\n<br>\n";
    return;
}

## sub make_peak_hours
##
#
## Lists the busiest hours, at most 20 and never more than were logged.
sub make_peak_hours {
    print {$fh_out} <<"XX";
<a name="peak"></a>
<h2>Peak Hours</h2>
<table $table_def>
<tr><th $table_header_def>Number of Hits</th><th $table_header_def>Date</th><th $table_header_def>Time</th></tr>
XX

    my $last = last_shown_index(scalar @peaktimes, 20);
    for my $i (0 .. $last) {
        my $slot = $peaktimes[$i];
        my ($tdate, $hour) = split /-/, $slot;
        my ($h12, $meridian) = twelve_hour($hour);
        print {$fh_out} <<"XX";
<tr><td>$hph{$slot}</td><td>$tdate</td><td>$h12 $meridian</td></tr>
XX
    }
    print {$fh_out} "</table>\n<br>\n";
    return;
}

## print_count_table($heading, $label, \%count_of, \@ordered_keys)
## Prints an <h3> heading followed by a two-column "Hits / $label" table
## with one row per key, in the order given.
sub print_count_table {
    my ($heading, $label, $count_of, $ordered_keys) = @_;
    print {$fh_out} "<h3>$heading</h3>\n";
    print {$fh_out} "<table $table_def>\n<tr>\n";
    print {$fh_out} " <th $table_header_def>Hits</th>\n <th $table_header_def>$label</th>\n</tr>\n";
    foreach my $key (@$ordered_keys) {
        my $shown = escape_html($key);
        print {$fh_out} "<tr>\n <td>$count_of->{$key}</td>\n";
        print {$fh_out} " <td>$shown</td>\n</tr>\n";
    }
    print {$fh_out} "</table>\n<br>\n";
    return;
}

## sub make_agent_info
##
#
## Prints the browser, browser-level and platform break-downs.
sub make_agent_info {
    print {$fh_out} <<"XX";
<a name="agents"></a>
<h2>User Agent Info</h2>
XX
    print_count_table("Browser Version", "User Agent",    \%agent,       \@top_agent);
    print_count_table("Browser Level",   "Browser Level", \%agent_level, \@top_level);
    print_count_table("Platform",        "Platform",      \%platform,    \@top_platform);
    return;
}

## list_other_browsers
## Lists every user agent that did not identify itself as Mozilla.
sub list_other_browsers {
    print {$fh_out} <<"XX";
<h3>Other Browsers</h3>
<table $table_def>
<tr><th $table_header_def>Number of Hits</th><th $table_header_def>User Agent</th></tr>
XX
    foreach my $browser (@browsers) {
        my $shown = escape_html($browser);
        print {$fh_out} "<tr><td>$misc_browsers{$browser}</td><td>$shown</td></tr>\n";
    }
    print {$fh_out} "</table>\n";
    return;
}

## is_hidden($dir)
## Returns 1 when $dir matches any pattern in @hidden_dirs, else 0.
## Each entry is used as a regular expression.  Empty entries are skipped:
## an empty pattern would silently reuse the last successful match.
sub is_hidden {
    my ($dir) = @_;
    foreach my $pattern (@hidden_dirs) {
        next unless defined $pattern && length $pattern;
        return 1 if $dir =~ m#$pattern#;
    }
    return 0;
}

## mod_warn($mod)
## Reports a missing module and returns 1.  The message goes to STDERR
## because STDOUT may be carrying the generated report.
sub mod_warn {
    my ($mod) = @_;
    print STDERR "Warning: '$mod' module not found!\n";
    return 1;
}

## parse_config($file)
## Reads "name value" lines from $file (lines starting with '#' are
## comments; one pair of surrounding quotes is removed from the value)
## and uses them for every option not already set on the command line.
## hidden_dirs entries are added to, not substituted for, --hidden_dirs.
## Dies when $file cannot be opened; returns 1.
sub parse_config {
    my ($file) = @_;
    my %conf;

    open(my $conf_fh, '<', $file)
        or die "Couldn't read config file '$file': $!";
    while (my $line = <$conf_fh>) {
        next if $line =~ /^#/;
        $line =~ s/\s+$//;                       # newline, CR, trailing blanks
        next unless $line =~ /^(\w+)\s+(.*)$/;
        my ($name, $value) = ($1, $2);
        $value =~ s/^['"]//;
        $value =~ s/['"]$//;
        $conf{$name} = $value;
    }
    close($conf_fh);

    $graph            = $conf{graph}            unless defined $graph;
    $hostlookup       = $conf{hostlookup}       unless defined $hostlookup;
    $use_meta_default = $conf{use_meta_default} unless defined $use_meta_default;

    $server           ||= $conf{server};
    $least_hits       ||= $conf{least_hits};
    $max_dirs         ||= $conf{max_dirs};
    $max_clients      ||= $conf{max_clients};
    $max_referers     ||= $conf{max_referers};
    $html_title       ||= $conf{html_title};
    $body_def         ||= $conf{body_def};
    $table_def        ||= $conf{table_def};
    $img_dir          ||= $conf{img_dir};
    $table_header_def ||= $conf{table_header_def};

    if (defined $conf{hidden_dirs}) {
        push(@hidden_dirs, split(' ', $conf{hidden_dirs}));
    }
    return 1;
}

__END__

=head1 NAME

superstat - generates a statistical report from an HTTP access log.

=head1 URL

http://www.serversolved.com/superstat/

=head1 SYNOPSIS

B<superstat> [B<--infile, --outfile, --config>=I<file>]
[B<--hidden_dirs>=I<list>]
[B<--least_hits, --max_clients, --max_dirs, --max_referers>=I<num>]
[B<--body_def, --table_def --table_header_def>=I<def>]
[B<--[no]graph, --[no]hostlookup>]
[B<--use_meta_default, --(silent|quiet)>]
[B<--server, --title>=I<string>]

Options are:

    --infile=file           use logfile file
    --outfile=file          generate HTML to file
    --config=file           read config from file
    --hidden_dirs=list      Don't display URLs matching anything in list
    --least_hits=num        Display only pages with hits over num
    --max_clients=num       Display top num clients
    --max_dirs=num          Display top num directories
    --max_referers=num      Display top num referers
    --[no]graph             Create an hourly-breakdown graph
    --[no]hostlookup        Lookup hostnames for Top Clients
    --use_meta_default      if infile.meta exists, use the meta file.

HTML options:

    --server=server         The name of the server
    --title=title           Use title for the HTML title
    --body_def=def          HTML definition for the document body
    --table_def=def         HTML definition for the tables
    --table_header_def=def  HTML definition for the table headers

=head1 DESCRIPTION

Superstat takes a server's access log, and generates statistics including
top requests, most frequently visited directories, top clients, and even
an hourly breakdown of activity.

The most common way to use superstat is with the command:

    superstat --infile=access.archive --outfile=stats.html

(If you don't have the 'Getopt::Long' module for perl, you would use the
command 'superstat access.archive stats.html' to achieve the same effect)

It is highly advised that you not run superstat on an active log file!
This is just a bad idea in general. You should always rotate (archive) the
access log or at least make a copy of the log to analyze. If you run this
program on an active log file, it is quite possible your server will stop
writing to it!

Superstat also creates a metafile through the 'Storable' module for perl
to save the hashes generated from parsing the log. (It's a very
straightforward use of Storable - see Storable's documentation for more
details.) The metafile is handy for changing the formatting of the output
file without having to re-parse the log file again.

As far as I know, you can use any option on a metafile and receive the
same results as reparsing the log. The one exception would be --infile
(duh!). If you find other options that work differently on a metafile
than on the actual log file, please email me and let me know.

A Note on Modules

This program uses three modules by default. You can still analyze access
logs and generate a web page without these modules, but you'll have to
make a few simple changes, and you'll miss out on a few features!

    Storable       used to create a 'meta' file for faster loading.
                   Comment out the 'use Storable;' line, and add
                   '$NO_STOR = 1;'

    Socket         used for performing hostlookups.
                   Comment out the 'use Socket;' line, and add
                   '$NO_SOCK = 1;'

    Getopt::Long   used to parse command line options.
                   Comment out the 'use Getopt::Long;' and add
                   '$NO_OPT = 1;' (you can still use all the options
                   by including them in the config file)

Of course, all three modules are freely available from CPAN.

=head1 SEE ALSO

The B<README> that should have come with this program. An HTML version is
available at http://www.serversolved.com/superstat/

=head1 AUTHOR

Pete Nelson