2 # Search for bad words in strings
4 # Copyright (c) 2000 Dirk Koopman
19 our $regex; # the big bad regex generated from @relist
20 our @relist; # the list of regexes to try, record = [canonical word, regex]
21 my %in; # the collection of words we are building up and their regexes
24 # load the badwords file(s)
36 my $newfn = localdata("badword.new");
37 filecopy("$main::data/badword.new.issue", $newfn) unless -e $newfn;
40 dbg("BadWords: Found new style badword.new file");
41 my $fh = new IO::File $newfn;
49 @relist = sort {$a->[0] cmp $b->[0]} @relist; # just in case...
50 dbg("BadWords: " . scalar @relist . " new style badwords read");
53 my $l = "BadWords: can't open $newfn $!";
61 # using old style files
62 my $bwfn = localdata("badword");
63 filecopy("$main::data/badword.issue", $bwfn) unless -e $bwfn;
65 # parse the existing static file
66 dbg("BadWords: Using old style badword file");
68 my $fh = new IO::File $bwfn;
75 unless (/\w+\s+=>\s+\d+,/) {
76 dbg("BadWords: syntax error in $bwfn:$line '$_'");
79 my @line = split /\s+/, uc $_;
80 shift @line unless $line[0];
86 my $l = "BadWords: can't open $bwfn $!";
92 # do the same for badw_regex
93 my $regexfn = localdata("badw_regex");
94 filecopy("$main::data/badw_regex.gb.issue", $regexfn) unless -e $regexfn;
95 dbg("BadWords: Using old style badw_regex file");
96 $fh = new IO::File $regexfn;
103 push @inw, split /\s+/, uc $_;
108 my $l = "BadWords: can't open $regexfn $!";
117 # catch most of the potential duplicates
123 # create the master regex
126 # use new style from now on
136 @relist = sort {$a->[0] cmp $b->[0]} @relist;
138 $res .= qq{\\b(?:$_->[1]) |\n};
140 $res =~ s/\s*\|\s*$//;
141 $regex = qr/\b($res)/x;
148 $w =~ tr/01/OI/; # de-leet any incoming words
149 my $last = ''; # remove duplicate letters (eg BOLLOCKS > BOLOCKS)
156 return @w ? join('', @w) : '';
161 my @list = split /\s+/, shift;
168 next unless $w && $w =~ /^\w+$/; # has to be a word
169 next if $in{$w}; # ignore any we have already dealt with
170 next if _slowcheck($w); # check whether this will already be detected
172 # re-leet word (in regex speak)if required
173 my @l = map { s/O/[O0]/g; s/I/[I1]/g; $_ } split //, $w;
174 my $e = join '+[\s\W]*', @l;
176 push @relist, [$w, $q];
178 dbg("$w = $q") if isdbg('badword');
186 my @list = split /\s+/, shift;
194 @relist = grep {$_->[0] ne $w} @relist;
203 return map { $full ? "$_->[0] = $_->[1]" : $_->[0] } @relist;
206 # check the text against the badwords list
214 @out = grep {++$uniq{$_}; $uniq{$_} == 1 ? $_ : undef }($s =~ /($regex)/g);
215 dbg("BadWords: check '$s' = '" . join(', ', @out) . "'") if isdbg('badword');
218 return _slowcheck($s) if @relist;
229 push @out, $w =~ /\b($_->[1])/;
234 # write out the new bad words list
238 my $newfn = localdata("badword.new");
239 my $fh = new IO::File ">$newfn";
241 dbg("BadWords: put new badword.new file");
242 @relist = sort {$a->[0] cmp $b->[0]} @relist;
244 print $fh "$_->[0]\n";
249 my $l = "BadWords: can't open $newfn $!";