#!/usr/bin/perl

#
# amavislogsumm - analyse amavisd-new logfiles. Read from STDIN
#
# amavislogsumm needs logfiles from amavisd-new with a minimum
# loglevel of 2
#

# SYNOPSIS
# amavislogsumm [options] < amavis.log
#

# OPTIONS
#
#     -S
#     --show_sender     also list Spam/Virus by Sender
#     
#     -h <cnt>          top <cnt> to display in reports
#
#     -t <threshold>    threshold of percent in Tests-Report
#
#     -i <interval>     width of the score intervals in the Scores-of-Spammails table (default 10)
#
#     --help            Emit short usage message and bail out
#
#     -R
#     --noranking       don't list the Tests-Report
#
#     -B
#     --nobayes         don't list the Bayes-Report
#

############################################################################
#
# Changelog
#
# 17 Feb. 2004
# * commented out Date::Calc - it's not used
# * fixed division by zero at `printf "\tSPAM     : %5.2f\n", ($spamHitSum / $spamCount);'
# * help on commandline fixed
# * cosmetics on Source and Output
# * handle comma-separated lists of recipients
#
# 9 Mar. 2004
# * change Totals to sum up properly
#
# 27 Apr. 2004
# * removed unused Code
# * renamed some Variables
# * fixed error in Average Score Calculation
#


use strict;
use warnings;
use Getopt::Long;


# Program identity, used in the usage and the --version output.
my $progName = "amavislogsumm";
my $version = "0.6";

# Usage text printed for --help and when option parsing fails.
# It lists the options registered with GetOptions below.  -d is still
# accepted for backward compatibility, but nothing in this file reads it,
# so it is no longer advertised.
my $usageMsg =
    "usage: $progName [-h <cnt>] [-S|--show_sender] [-i <interval>]
        [-t <threshold>] [-R|--noranking] [-B|--nobayes]
        [ < file]

        $progName --[version|help]";

# Month name tables and the current month/year.
# NOTE(review): none of these four variables is referenced by the rest of
# this file; they are kept only so that nothing outside this block changes.
my @monthNames = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
my %monthNums = qw(
Jan  0 Feb  1 Mar  2 Apr  3 May  4 Jun  5
Jul  6 Aug  7 Sep  8 Oct  9 Nov 10 Dec 11);
my ($thisMon, $thisYr) = (localtime(time()))[4,5];
$thisYr += 1900;

# BAYES test counting: one counter per BAYES_nn bucket, pre-seeded with 0 so
# that every bucket shows up in the report even when it was never hit.
my @bayesBuckets = qw(00 01 10 20 30 40 44 50 56 60 70 80 90 99);
my %spamBayes = map { $_ => 0 } @bayesBuckets;
my %noneSpamBayes = map { $_ => 0 } @bayesBuckets;

# Command line options, keyed by option name.
my %opts;

# State of the mail that is currently being parsed from the log.
my $logLine;
my ($sender, $recipient);
my $virus;
my ($hits, $tests);
my $isSpamMail = 0;
my $isVirusMail = 0;

# NOTE(review): declared by the original script but not used anywhere in
# this file; kept so that the set of file-level names is unchanged.
my ($lineDate, $month, $date, $time_H, $time_M, $time_S, $host, $process, $pid);

# Number of mails seen (one per TIMING line)
my $timeLineCount = 0;

# Mails for which no score was logged
my $unscoredMail = 0;

# Total Time used to scan mails (milliseconds, as logged in the TIMING line)
my $timeSum = 0;

# Time used scanning infected Mails
my $virusTimeSum = 0;

# Number of Virusmails
my $virusCount = 0;

# Number of Spammails
my $spamCount = 0;

# Sum of hitpoints for spam
my $spamHitSum = 0;

# Sum of hitpoints for nonespam
my $noneSpamHitSum = 0;

# Score extremes; the start values are sentinels that any real score replaces.
my $maxSpamScore = -1000;
my $minSpamScore = 1000;
my $maxNoneSpamScore = -1000;
my $minNoneSpamScore = 1000;

# Histogram of spam scores, indexed by int(score / $scoreInterval).
my $scoreInterval;
my @scores;

# Per-key counters filled while parsing and printed by the show* subs.
my (%spamBySender, %spamByRecipient);
my (%spamTests, %noneSpamTests);
my (%virus, %virusBySender, %virusByRecipient);

# Long and short spellings of a switch share one storage slot in %opts.
GetOptions(
    "d=s"           => \$opts{'d'},
    "i=i"           => \$opts{'i'},
    "t=i"           => \$opts{'t'},
    "h=i"           => \$opts{'h'},
    "help"          => \$opts{'help'},
    "version"       => \$opts{'version'},
    "S"             => \$opts{'S'},
    "show_sender"   => \$opts{'S'},
    "R"             => \$opts{'noranking'},
    "noranking"     => \$opts{'noranking'},
    "B"             => \$opts{'nobayes'},
    "nobayes"       => \$opts{'nobayes'}
) || die "$usageMsg\n";


if (defined($opts{'help'})) {
    print "$usageMsg\n";
    exit 0;
}

if (defined($opts{'version'})) {
    print "$progName $version\n";
    exit 0;
}

# Width of one bucket of the spam score histogram.
if (defined($opts{'i'})) {
    if ($opts{'i'} < 1) {
        # An invalid option value is an error: report it on STDERR and
        # leave with a non-zero status so that callers can detect it.
        print STDERR "Value for option i must be > 0\n";
        exit 1;
    }
    $scoreInterval = $opts{'i'};
} else {
    $scoreInterval = 10;
}

# -1 means "no limit" for the top-<cnt> lists.
$opts{'h'} = -1 unless(defined($opts{'h'}));

# threshold for Spamtestlist
$opts{'t'} = 1 unless(defined($opts{'t'}));
$opts{'t'} = 1 if ($opts{'t'} < 1);

# Account for one finished mail.
#
# Arguments:
#   1. total processing time taken from the TIMING log line
#   2. true when the mail was flagged as infected
#   3. spam flag - accepted from the caller but not used here
sub workTiming {
    my ($elapsed, $wasInfected) = @_;

    $timeLineCount += 1;
    $timeSum += $elapsed;

    # Infected mails are additionally tracked in their own time total.
    $virusTimeSum += $elapsed if ($wasInfected);
}

# Count one infected mail.
#
# Arguments: sender address, recipient list, virus name.
# The recipient list may hold several comma separated addresses; each of
# them gets its own hit in %virusByRecipient.
sub workVirus {
    my ($from, $to, $name) = @_;

    $virusCount += 1;
    $virus{$name} += 1;
    $virusBySender{$from} += 1;

    # Drop the angle brackets that remain between the individual addresses.
    $to =~ tr/<>//d;
    $virusByRecipient{$_} += 1 for (split(',', $to));
}

# Count one spam mail.
#
# Arguments: sender address, recipient list.
# The recipient list may hold several comma separated addresses; each of
# them gets its own hit in %spamByRecipient.
sub workSpam {
    my ($from, $to) = @_;

    $spamCount += 1;
    $spamBySender{$from} += 1;

    # Drop the angle brackets that remain between the individual addresses.
    $to =~ tr/<>//d;
    $spamByRecipient{$_} += 1 for (split(',', $to));
}

# Print the "Timings" section of the report.
# The time sums are divided by 1000 to get the values labelled "Seconds".
# The per-mail average divides by $timeLineCount, so this must only be
# called when at least one mail was counted.
sub showTiming {
    my $totalSeconds = $timeSum / 1000;
    my $infectedSeconds = $virusTimeSum / 1000;

    heading("Timings");
    printf "\tSeconds total: %.2f\n\tMinutes total: %.2f\n", $totalSeconds, $totalSeconds / 60;
    printf "\tSeconds per Mail: %.2f\n", $totalSeconds / $timeLineCount;
    printf "\tSeconds for infected Mails: %.2f\n", $infectedSeconds;

    # The per-virus average only exists when a virus was found.
    if ($virusCount > 0) {
        printf "\tSeconds per infected Mail: %.2f\n", $infectedSeconds / $virusCount;
    }
}

# Print the entries of a counter hash, highest count first.
#
# Argument: reference to a hash that maps a name to a count.
# When option -h holds a positive number, only that many lines are printed;
# any other value (the default is -1) prints all entries.
sub listHashOrderedByValue {
    my ($hashRef) = @_;
    my $limit = defined($opts{'h'}) ? $opts{'h'} : -1;

    # Entries with equal counts are ordered by name.  Without this the order
    # of ties follows the hash order, so the rows shown under -h could
    # differ from one run to the next.
    my @keys = sort {
        $hashRef->{$b} <=> $hashRef->{$a} || $a cmp $b
    } keys %$hashRef;

    my $shown = 0;
    foreach my $key (@keys) {
        printf "\t%5i : %s\n", $hashRef->{$key}, $key;
        $shown++;
        last if ($limit > 0 and $shown >= $limit);
    }
}

# Print the entries of a counter hash, highest count first, together with
# their share of a reference count in percent.
#
# Arguments:
#   1. reference to a hash that maps a name to a count
#   2. reference count that represents 100 %
# Listing stops at the first entry whose share is not above the threshold in
# option -t, or after the number of lines given by a positive option -h.
sub listHashOrderedByValuePercent {
    my ($hashRef, $reference) = @_;

    # Without a positive reference count no share can be computed; return
    # quietly instead of dying with a division by zero.
    return unless (defined($reference) and $reference > 0);

    my $limit = defined($opts{'h'}) ? $opts{'h'} : -1;

    # Entries with equal counts are ordered by name so that the output is
    # the same on every run.
    my @keys = sort {
        $hashRef->{$b} <=> $hashRef->{$a} || $a cmp $b
    } keys %$hashRef;

    my $shown = 0;
    foreach my $key (@keys) {
        my $percent = $hashRef->{$key} / $reference * 100;

        # The keys are sorted by count, so every following entry is at or
        # below the threshold as well.
        last if ($percent <= $opts{'t'});

        printf "\t%.2f %% %5i : %s\n", $percent, $hashRef->{$key}, $key;
        $shown++;
        last if ($limit > 0 and $shown >= $limit);
    }
}

# Print a section title: an empty line, the title itself and below it a row
# of dashes that is exactly as long as the title.
sub heading {
    my ($title) = @_;
    my $underline = '-' x length($title);
    print "\n" . $title . "\n" . $underline . "\n";
}

# Print the virus sections of the report: by virus name, by sender (only
# when option -S / --show_sender was given) and by recipient.
sub showVirus {
    my @sections = ( [ "found Virus by occurrence:", \%virus ] );
    if (defined($opts{'S'})) {
        push @sections, [ "found Virus by Sender:", \%virusBySender ];
    }
    push @sections, [ "found Virus by Recipient:", \%virusByRecipient ];

    foreach my $section (@sections) {
        my ($title, $counts) = @$section;
        heading($title);
        listHashOrderedByValue($counts);
    }
}

# Print the spam sections of the report: always by recipient, and by sender
# only when option -S / --show_sender was given.
sub showSpam {
    heading("Spam by Recipient:");
    listHashOrderedByValue(\%spamByRecipient);

    return unless (defined($opts{'S'}));

    heading("Spam by Sender");
    listHashOrderedByValue(\%spamBySender);
}

# Print the "Totals" section: mail counts, average / extreme scores, the
# histogram of spam scores and (unless --nobayes was given) the Bayes tables.
sub startStats {
    # Number of mails that have a score and are not spam.  This is the
    # population $noneSpamHitSum was summed over, so it is the divisor of
    # the HAM average.
    my $scoredHamCount = $timeLineCount - $spamCount - $unscoredMail;

    heading("Totals:");
    printf "\t%5i : Mails\n", $timeLineCount;
    print "\tthereof\n";
    printf "\t%5i : INFECTED\n", $virusCount;
    printf "\t%5i : SPAM\n", $spamCount;
    printf "\t%5i : unscored\n", $unscoredMail;
    printf "\t%5i : clean\n", ($timeLineCount - ($spamCount + $virusCount + $unscoredMail));
    print "\n";

    print "\tAverage Score\n";
    if ($spamCount > 0) {
        printf "\tSPAM : %5.2f\n", ($spamHitSum / $spamCount);
    }
    # Guard on the divisor itself.  Checking only ($timeLineCount -
    # $spamCount) let a log in which every non-spam mail is unscored (for
    # example a log with nothing but virus mails) die with a division by zero.
    if ($scoredHamCount > 0) {
        printf "\tHAM  : %5.2f\n", ($noneSpamHitSum / $scoredHamCount);
        print "\n";
    }

    # NOTE(review): when no mail of a class was scored, the values below are
    # still the start sentinels (-1000 / 1000) set at initialisation.
    printf "\tMAX Spamscore : %5.2f\n", $maxSpamScore;
    printf "\tMIN Spamscore : %5.2f\n", $minSpamScore;
    print "\n";
    printf "\tMAX Hamscore : %5.2f\n", $maxNoneSpamScore;
    printf "\tMIN Hamscore : %5.2f\n", $minNoneSpamScore;

    # Histogram: bucket $i covers the scores from $i * interval up to
    # ($i + 1) * interval; buckets that were never hit are left out.
    print "\n\tScores of Spammails\n\tScore : Count\n\t-------------\n";
    foreach my $i (0 .. $#scores) {
        next unless (defined($scores[$i]));
        printf "\t%2i-%2i : %i\n", $i * $scoreInterval, ($i + 1) * $scoreInterval, $scores[$i];
    }

    return if (defined($opts{'nobayes'}));

    # Same table layout for spam and for ham, only title and data differ.
    foreach my $table ( [ "Spammails", \%spamBayes ], [ "Hammails", \%noneSpamBayes ] ) {
        my ($label, $counts) = @$table;
        print "\n\tBayes Tests in $label\n";
        print "\tprobability : count\n";
        print "\t-------------------\n";
        foreach my $key (sort keys %$counts) {
            printf "\t%s : %2i\n", $key, $counts->{$key};
        }
    }
}


# Count one mail for which the log contained no score.
sub workUnscoredMail {
    $unscoredMail += 1;
}

# Book the score and the matched tests of one scored mail.
#
# Arguments:
#   1. score (hits) of the mail
#   2. comma separated list of the matched tests; may be undefined or empty
#   3. true when the mail was classified as spam
#
# Spam mails update the score histogram, %spamTests, %spamBayes, the spam
# score sum and the spam extremes; all other mails update the matching
# none-spam variables.
sub workScoreTests {
    my ($score, $tests, $isSpam) = @_;

    # A mail can have a score without a logged test list; treat that as
    # "no tests" instead of running split and the match on undef.
    $tests = '' unless (defined($tests));

    my @testNames = split(',', $tests);
    my $bayesBucket = ($tests =~ m/BAYES_(\d\d)/) ? $1 : undef;

    if ($isSpam) {

        # Histogram bucket of this score.  A negative score would give a
        # negative array index, which is fatal on an empty array, so such
        # scores are counted in the first bucket.
        my $section = int($score / $scoreInterval);
        $section = 0 if ($section < 0);
        $scores[$section]++;

        # Count the passed Tests
        $spamTests{$_}++ foreach (@testNames);

        $spamHitSum += $score;
        $spamBayes{$bayesBucket}++ if (defined($bayesBucket));

        # Maximum and minimum are checked independently of each other.  With
        # if/elsif the first score (and every score that is a new maximum)
        # never reached the minimum check, so the minimum could stay at its
        # start value.
        $maxSpamScore = $score if ($score > $maxSpamScore);
        $minSpamScore = $score if ($score < $minSpamScore);

    } else {

        $maxNoneSpamScore = $score if ($score > $maxNoneSpamScore);
        $minNoneSpamScore = $score if ($score < $minNoneSpamScore);

        $noneSpamTests{$_}++ foreach (@testNames);

        $noneSpamHitSum += $score;
        $noneSpamBayes{$bayesBucket}++ if (defined($bayesBucket));

    }
}

# Print how often each test matched, once for spam mails and once for
# none-spam mails, as a share of the mails of that class.
sub showTestsRanking {
    #
    # List Number of Spammails that hit this Test
    # percent number : testname
    #
    heading("Ranking of Tests in Spammails:");
    listHashOrderedByValuePercent(\%spamTests, $spamCount);

    #
    # List Number of Nonespammails that hit this Test
    # percent number : testname
    #
    # Tests are only counted for mails that have a score, so 100 % is the
    # number of scored none-spam mails.  Unscored mails are left out of the
    # divisor, as they are for the HAM average in startStats; counting them
    # would lower every percentage.
    my $scoredHamCount = $timeLineCount - $spamCount - $unscoredMail;
    heading("Ranking of Tests in Nonespammails:");
    listHashOrderedByValuePercent(\%noneSpamTests, $scoredHamCount);
}


# Main parse loop.
#
# The log is read line by line.  The lines of one mail are collected in the
# per-mail state variables until the TIMING line of that mail shows up; then
# the mail is booked and the state is cleared for the next mail.  Lines after
# the last TIMING line belong to an unfinished mail and are not counted.

# Start with a clean per-mail state.
undef($isSpamMail);
undef($isVirusMail);
undef($hits);
undef($tests);

# <> is the only input source used here (files named on the command line,
# else STDIN), instead of testing eof on STDIN while reading from <>.
while (defined($logLine = <>)) {

    # spam_scan: hits=21.326 tests=<list of matched tests>
    # The score may be an integer or a decimal number ("5", "21.326", "-1.2").
    if ($logLine =~ m/spam_scan: hits=(-?\d+(?:\.\d+)?) tests=(.*)/) {
        $hits = $1 unless (defined($hits));
        $tests = $2;
    }

    # SPAM-TAG, <sender@domain.tld> -> <recipient@domain.tld>, Yes, hits=21
    # SPAM, <sender@domain.tld> -> <recipient@domain.tld>, Yes, hits=9.0
    # The whole score is captured, including the digits after the decimal
    # point; capturing only the integer part replaced the exact score from
    # the spam_scan line by a truncated one.
    if ($logLine =~ m/SPAM(?:-TAG)?, <(.*)> -> <(.*)>, Yes, hits=(\d+(?:\.\d+)?)/) {
        $isSpamMail = 1;
        $sender = $1;
        $recipient = $2;
        $hits = $3;
    }

    # INFECTED (I-Worm.Ganda ), <sender@domain.tld> -> <recipient@domain.tld>
    if ($logLine =~ m/INFECTED \((.*)\), <(.*)> -> <(.*)>,/) {
        $virus = $1;
        $sender = $2;
        $recipient = $3;
        $isVirusMail = 1;
    }

    # Only the TIMING line ends a mail.
    next unless ($logLine =~ m/TIMING\s\[total\s(\d+)\sms\]/);
    my $totalMs = $1;

    workTiming($totalMs, defined($isVirusMail), defined($isSpamMail));
    workVirus($sender, $recipient, $virus) if defined($isVirusMail);
    workSpam($sender, $recipient) if defined($isSpamMail);
    if (defined($hits)) {
        # A score can come from a SPAM line without any spam_scan line, in
        # which case there is no test list for this mail.
        workScoreTests($hits, (defined($tests) ? $tests : ''), defined($isSpamMail));
    } else {
        workUnscoredMail();
    }

    # Clear the state, including $tests, so that the next mail cannot
    # inherit the test list of this one.
    undef($isSpamMail);
    undef($isVirusMail);
    undef($hits);
    undef($tests);
}


# Report driver: without a single counted mail there is nothing to report,
# which is signalled with exit status 1.  Otherwise the report sections are
# printed in a fixed order.
unless ($timeLineCount) {
    print "no usable logdata found\n";
    print "please make sure you are running amavisd-new at minimum loglevel 2\n";
    exit 1;
}

startStats();
showTiming();

# The tests ranking can be switched off with -R / --noranking.
if (not defined($opts{'noranking'})) {
    showTestsRanking();
}

# The virus sections are only printed when a virus was found.
if ($virusCount > 0) {
    showVirus();
}

showSpam();

exit 0;
