X-Git-Url: http://gb7djk.dxcluster.net/gitweb/gitweb.cgi?a=blobdiff_plain;f=perl%2FBadWords.pm;h=09911b4ad5748bfde84a4e502aac9a823c044c48;hb=606b9690c7bb9e293cb4a03507f0280eea05d9ce;hp=312a04082c705fac550f9b0fcb065388cca2e564;hpb=6ec22e78a4a344ce675645fabf18b2a1971f364a;p=spider.git diff --git a/perl/BadWords.pm b/perl/BadWords.pm index 312a0408..09911b4a 100644 --- a/perl/BadWords.pm +++ b/perl/BadWords.pm @@ -12,85 +12,244 @@ use strict; use DXUtil; use DXVars; -use DXHash; use DXDebug; use IO::File; -use vars qw($badword $regexcode); +our $regex; # the big bad regex generated from @relist +our @relist; # the list of regexes to try, record = [canonical word, regex] +my %in; # the collection of words we are building up and their regexes -our $regex; -# load the badwords file +# load the badwords file(s) sub load { - my $bwfn = localdata("badword"); - filecopy("$main::data.issue", $bwfn) unless -e $bwfn; - - my @out; + %in = (); + @relist = (); + $regex = ''; - $badword = new DXHash "badword"; + my @inw; + my @out; + my $wasold; - push @out, create_regex(); - return @out; -} -sub create_regex -{ - $regex = localdata("badw_regex"); - filecopy("$regex.gb.issue", $regex) unless -e $regex; + my $newfn = localdata("badword.new"); + filecopy("$main::data/badword.new.issue", $newfn) unless -e $newfn; + if (-e $newfn) { + # new style + dbg("BadWords: Found new style badword.new file"); + my $fh = new IO::File $newfn; + if ($fh) { + while (<$fh>) { + chomp; + next if /^\s*\#/; + add_regex(uc $_); + } + $fh->close; + @relist = sort {$a->[0] cmp $b->[0]} @relist; # just in case... + dbg("BadWords: " . scalar @relist . " new style badwords read"); + } + else { + my $l = "BadWords: can't open $newfn $!"; + dbg($l); + push @out, $l; + return @out; + } + } + else { + + # using old style files + my $bwfn = localdata("badword"); + filecopy("$main::data/badword.issue", $bwfn) unless -e $bwfn; - my @out; - my $fh = new IO::File $regex; + # parse the existing static file + dbg("BadWords: Using old style badword file"); - if ($fh) { - my $s = "sub { my \$str = shift; my \@out; \n"; - while (<$fh>) { - chomp; - next if /^\s*\#/; - my @list = split " "; - for (@list) { - # create a closure for each word so that it matches stuff with spaces/punctuation - # and repeated characters in it - my $w = uc $_; - my @l = split //, $w; - my $e = join '+[\s\W]*', @l; - $s .= qq{push \@out, \$1 if \$str =~ m|\\b($e+)|;\n}; + my $fh = new IO::File $bwfn; + if ($fh) { + my $line = 0; + while (<$fh>) { + chomp; + ++$line; + next if /^\s*\#/; + unless (/\w+\s+=>\s+\d+,/) { + dbg("BadWords: syntax error in $bwfn:$line '$_'"); + next; + } + my @line = split /\s+/, uc $_; + shift @line unless $line[0]; + push @inw, $line[0]; } + $fh->close; } - $s .= "return \@out;\n}"; - $regexcode = eval $s; - dbg($s) if isdbg('badword'); - if ($@) { - @out = ($@); - dbg($@); + else { + my $l = "BadWords: can't open $bwfn $!"; + dbg($l); + push @out, $l; return @out; } - $fh->close; - } else { - my $l = "can't open $regex $!"; - dbg($l); - push @out, $l; + + # do the same for badw_regex + my $regexfn = localdata("badw_regex"); + filecopy("$main::data/badw_regex.gb.issue", $regexfn) unless -e $regexfn; + dbg("BadWords: Using old style badw_regex file"); + $fh = new IO::File $regexfn; + + if ($fh) { + while (<$fh>) { + chomp; + next if /^\s*\#/; + next if /^\s*$/; + push @inw, split /\s+/, uc $_; + } + $fh->close; + } + else { + my $l = "BadWords: can't open $regexfn $!"; + dbg($l); + push @out, $l; + return @out; + } + + ++$wasold; } + + # catch most of the potential duplicates + @inw = sort @inw; + for (@inw) { + add_regex($_); + } + + # create the master regex + generate_regex(); + # use new style from now on + put() if $wasold; + + return @out; } +sub generate_regex +{ + my $res; + @relist = sort {$a->[0] cmp $b->[0]} @relist; + for (@relist) { + $res .= qq{(?:$_->[1]) |\n}; + } + $res =~ s/\s*\|\s*$//; + $regex = qr/\b($res)/x; +} + + +sub _cleanword +{ + my $w = uc shift; + $w =~ tr/01/OI/; # de-leet any incoming words + my $last = ''; # remove duplicate letters (eg BOLLOCKS > BOLOCKS) + my @w; + for (split //, $w) { + next if $last eq $_; + $last = $_; + push @w, $_; + } + return @w ? join('', @w) : ''; +} + +sub add_regex +{ + my @list = split /\s+/, shift; + my @out; + + for (@list) { + my $w = uc $_; + $w = _cleanword($w); + + next unless $w && $w =~ /^\w+$/; # has to be a word + next if $in{$w}; # ignore any we have already dealt with + next if _slowcheck($w); # check whether this will already be detected + + # re-leet word (in regex speak)if required + my @l = map { s/O/[O0]/g; s/I/[I1]/g; $_ } split //, $w; + my $e = join '+[\s\W]*', @l; + my $q = $e; + push @relist, [$w, $q]; + $in{$w} = $q; + dbg("$w = $q") if isdbg('badword'); + push @out, $w; + } + return @out; +} + +sub del_regex +{ + my @list = split /\s+/, shift; + my @out; + + for (@list) { + my $w = uc $_; + $w = _cleanword($w); + next unless $in{$w}; + delete $in{$w}; + @relist = grep {$_->[0] ne $w} @relist; + push @out, $w + } + return @out; +} + +sub list_regex +{ + my $full = shift; + return map { $full ? "$_->[0] = $_->[1]" : $_->[0] } @relist; +} + # check the text against the badwords list sub check { my $s = uc shift; my @out; - - push @out, &$regexcode($s) if $regexcode; - - return @out if @out; - for (split(/\b/, $s)) { - push @out, $_ if $badword->in($_); + if ($regex) { + my %uniq; + @out = grep {++$uniq{$_}; $uniq{$_} == 1 ? $_ : undef }($s =~ /\b($regex)/g); + dbg("BadWords: check '$s' = '" . join(', ', @out) . "'") if isdbg('badword'); + return @out; } + return _slowcheck($s) if @relist; + return; +} + +sub _slowcheck +{ + my $w = shift; + my @out; + + for (@relist) { + push @out, $w =~ /\b($_->[1])/; + } return @out; } +# write out the new bad words list +sub put +{ + my @out; + my $newfn = localdata("badword.new"); + my $fh = new IO::File ">$newfn"; + if ($fh) { + dbg("BadWords: put new badword.new file"); + @relist = sort {$a->[0] cmp $b->[0]} @relist; + for (@relist) { + print $fh "$_->[0]\n"; + } + $fh->close; + } + else { + my $l = "BadWords: can't open $newfn $!"; + dbg($l); + push @out, $l; + return @out; + } +} 1;