Sophie

Sophie

distrib > Mandriva > 2010.1 > x86_64 > media > contrib-release > by-pkgid > e1c01a318d77e5f1a6fe645ebc4f812f > files > 32

dspam-3.9.0-3mdv2010.1.x86_64.rpm

#!/usr/bin/perl

use strict;
use vars qw { @archives $user };

# Corpus directories to look for, relative to the current working directory.
@archives = qw(easy_ham hard_ham spam spam_2);

print "SpamAssassin Public Corpus Trainer v0.1.0\n\n";

# The single command-line argument is the user handed to Train(); without it
# there is nothing to do, so print the usage line and stop.
$user = shift;
unless (defined($user) && $user ne "") {
  print "Syntax: $0 [username]\n";
  exit(-1);
}

# Train on every corpus directory that exists; just report the missing ones.
foreach my $archive (@archives) {
  print "Searching for corpus $archive ...\n";

  unless (-d $archive) {
    print "...not found.\n";
    next;
  }

  print "...found it!\n";
  print "Training with corpus $archive ...";
  Train($user, $archive);
  print "...done!\n";
}

print "Training complete.\n";
print qq{Now run "dspam_clean -p0 $user" to purge uninteresting data\n};

# Train dspam for one user on every message file found in a corpus directory.
#
# Arguments:
#   $user   - value passed to dspam as --user
#   $corpus - directory holding one message per file.  A name containing
#             "ham" is trained as class "innocent"; otherwise a name
#             containing "spam" is trained as class "spam".  Anything else
#             is reported and skipped.
#
# Every regular file in the directory, except one named "cmds", is fed to
# dspam on standard input.  Problems are reported on standard output and do
# not stop the run.  Returns nothing.
sub Train {
  my($user, $corpus) = @_;
  my($class);

  # Decide the class first so an unrecognised corpus is skipped before the
  # directory is read.
  if ($corpus =~ /ham/) {
    $class = "innocent";
  } elsif ($corpus =~ /spam/) {
    $class = "spam";
  } else {
    print "Unable to determine whether $corpus is ham or spam. Skipping.\n";
    return;
  }

  # Lexical handle, and a checked opendir: a failure here used to be silent
  # and looked exactly like an empty corpus.
  my $dir;
  if (!opendir($dir, $corpus)) {
    print "Unable to open corpus directory $corpus: $!\n";
    return;
  }
  my @files = grep(!/^\.\.?$/, readdir($dir));
  closedir($dir);

  foreach my $file (@files) {
    next if ($file eq "cmds");

    my $path = "$corpus/$file";
    next unless (-f $path);   # sub-directories etc. cannot be fed to dspam

    # $cmd is only the human-readable form used in the error report; the
    # command itself is run without a shell by _run_dspam().
    my $cmd = "dspam --user $user --class=$class --source=corpus < $path";
    my $ret = _run_dspam($user, $class, $path);
    print "Command returned error $ret: $cmd\n" if ($ret);
  }

  return;
}

# Run dspam once with standard input taken from $path.
#
# The arguments are handed to exec as a list, so no shell is involved and
# spaces or shell metacharacters in $user or $path reach dspam unchanged.
#
# Returns the raw wait status ($?) of the child, which is 0 on success, or
# -1 when the child process could not be created.
sub _run_dspam {
  my($user, $class, $path) = @_;

  # Loaded here, in the parent, so the child never has to load a module.
  require POSIX;

  my $pid = fork();
  return -1 unless (defined $pid);

  if ($pid == 0) {
    # Child: leave through POSIX::_exit on failure so the parent's END
    # blocks and buffered output are not run or flushed a second time.
    if (!open(STDIN, '<', $path)) {
      print STDERR "Cannot open $path: $!\n";
      POSIX::_exit(1);
    }
    { exec('dspam', '--user', $user, "--class=$class", '--source=corpus') };
    print STDERR "Cannot run dspam: $!\n";
    POSIX::_exit(127);
  }

  waitpid($pid, 0);
  return $?;
}