X-Git-Url: http://gb7djk.dxcluster.net/gitweb/gitweb.cgi?a=blobdiff_plain;f=perl%2FBadWords.pm;h=141b3e9a729b6a933149cb0bf71bf4a942938285;hb=refs%2Fheads%2Fnew-spawn;hp=9814e3fadd5ef712d25315887d1582e2c509d338;hpb=e0cfa6eebc0cddc92fe3c45636e6f1f641edda1b;p=spider.git diff --git a/perl/BadWords.pm b/perl/BadWords.pm index 9814e3fa..141b3e9a 100644 --- a/perl/BadWords.pm +++ b/perl/BadWords.pm @@ -3,7 +3,7 @@ # # Copyright (c) 2000 Dirk Koopman # -# $Id$ +# # package BadWords; @@ -13,24 +13,26 @@ use strict; use DXUtil; use DXVars; use DXHash; +use DXDebug; + use IO::File; -use vars qw($badword); +use vars qw($badword $regexcode); -my $oldfn = "$main::data/badwords"; -$badword = new DXHash "badword"; +my $oldfn = localdata("badwords"); +my $regex = localdata("badw_regex"); +my $bwfn = localdata("badword"); + +# copy issue ones across +filecopy("$regex.gb.issue", $regex) unless -e $regex; +filecopy("$bwfn.issue", $bwfn) unless -e $bwfn; -use vars qw($VERSION $BRANCH); -$VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ ); -$BRANCH = sprintf( "%d.%03d", q$Revision$ =~ /\d+\.\d+\.(\d+)\.(\d+)/ ) || 0; -$main::build += $VERSION; -$main::branch += $BRANCH; +$badword = new DXHash "badword"; # load the badwords file sub load { my @out; - return unless -e $oldfn; my $fh = new IO::File $oldfn; if ($fh) { @@ -45,11 +47,46 @@ sub load $fh->close; $badword->put; unlink $oldfn; + } + push @out, create_regex(); + return @out; +} + +sub create_regex +{ + my @out; + my $fh = new IO::File $regex; + + if ($fh) { + my $s = "sub { my \$str = shift; my \@out; \n"; + while (<$fh>) { + chomp; + next if /^\s*\#/; + my @list = split " "; + for (@list) { + # create a closure for each word so that it matches stuff with spaces/punctuation + # and repeated characters in it + my $w = uc $_; + my @l = split //, $w; + my $e = join '+[\s\W]*', @l; + $s .= "push \@out, \$1 if \$str =~ /\\b($e)/;\n"; + } + } + $s .= "return \@out;\n}"; + $regexcode = eval $s; + dbg($s) if isdbg('badword'); + if ($@) { + @out = ($@); + dbg($@); + return @out; + } + $fh->close; } else { - my $l = "can't open $oldfn $!"; + my $l = "can't open $regex $!"; dbg($l); push @out, $l; } + return @out; } @@ -57,32 +94,17 @@ sub load sub check { my $s = uc shift; + my @out; + + push @out, &$regexcode($s) if $regexcode; - for (split(/\s+/, $s)) { - s/[^\w]//g; - return $_ if $badword->in($_); - s/\'?S$//; - return $_ if $badword->in($_); - } + return @out if @out; - # look for a few of the common ones with spaces and stuff - if ($s =~ /F[\s\W]*U[\s\W]*C[\s\W]*K/) { - return "FUCK"; - } elsif ($s =~ /C[\s\W]*U[\s\W]*N[\s\W]*T/) { - return "CUNT"; - } elsif ($s =~ /W[\s\W]*A[\s\W]*N[\s\W]*K/) { - return "WANK"; - } elsif ($s =~ /C[\s\W]*[0O][\s\W]*C[\s\W]*K/) { - return "COCK"; - } elsif ($s =~ /S[\s\W]*H[\s\W]*[I1][\s\W]*T/) { - return "SHIT"; - } elsif ($s =~ /P[\s\W]*[I1][\s\W]*S[\s\W]*S/) { - return "PISS"; - } elsif ($s =~ /B[\s\W]*[O0][\s\W]*L[\s\W]*L[\s\W]*[O0][\s\W]*[CK]/) { - return "BOLLOCKS"; + for (split(/\b/, $s)) { + push @out, $_ if $badword->in($_); } - - return (); + + return @out; } 1;