Preface
The code in this article implements a sensitive-word filter for Chinese text. Sensitive words prepared in advance are loaded into a prefix tree (trie), which makes lookups fast while keeping memory usage low. A typical use is checking whether registered usernames or user comments contain inappropriate language.
The filter can check whether a piece of content contains sensitive words, extract the sensitive words it contains, and replace those words with a configured replacement character.
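A quick usage sketch of those three operations (the full implementation and a JUnit test class follow below; the QuickStart class name and the sample word list are only for illustration):

import java.util.Arrays;

public class QuickStart {
    public static void main(String[] args) {
        // build the dictionary from a sample word list
        SensitiveWordFilters.refactoringBy(Arrays.asList("敏感", "敏感词"));

        // does the text contain a sensitive word? -> true
        System.out.println(SensitiveWordFilters.foundIn("这段文本很敏感"));

        // which sensitive words does it contain? Ignored characters (spaces, '|', '-')
        // are skipped while matching but kept in the returned span, so this prints [敏 感 词]
        System.out.println(SensitiveWordFilters.findOutFrom("敏 感 词都被找到"));

        // mask every character of each match with the replacement character -> ***被替换
        System.out.println(SensitiveWordFilters.replace("敏感词被替换", "*"));
    }
}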
Environment
The code uses JDK 8 syntax and the JUnit Jupiter test framework. The Maven configuration is shown below:
<properties>
    <java.version>1.8</java.version>
    <maven.compiler.source>${java.version}</maven.compiler.source>
    <maven.compiler.target>${java.version}</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
    <dependency>
        <groupId>org.junit.jupiter</groupId>
        <artifactId>junit-jupiter</artifactId>
        <version>RELEASE</version>
        <scope>test</scope>
    </dependency>
</dependencies>
Filter Source Code
import java.util.*;
import java.util.function.Predicate;
/**
 * Sensitive-word filter (Chinese text only)
 */
public class SensitiveWordFilters {
    /**
     * Given, say, the sensitive words [敏感, 敏感词] in the dictionary:
     * true
     * ├── matching ends as soon as [敏感] is hit
     * └── faster; intended for simple yes/no checks
     * false
     * ├── matching only ends once [敏感词] is hit
     * └── slower, but when replacing sensitive words it can mask the longer match
     */
    private static final boolean SIMPLE_MATCH = false;
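    // For example, with the dictionary [敏感, 敏感词] and the text "敏感词汇":
    //   SIMPLE_MATCH = true  -> the match stops at "敏感"
    //   SIMPLE_MATCH = false -> the match extends to the longer "敏感词"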
    /**
     * Characters to ignore during matching
     */
    private static final List<Character> IGNORE_CHAR_LIST = ignoreCharListInit();
    /**
     * Ignore certain characters.
     * If the dictionary contains the sensitive word [敏感词], the text [敏 感 词] is still flagged,
     * because space characters are ignored.
     * The same predicate is applied when rebuilding the dictionary and when adding sensitive words to it.
     */
    private static final Predicate<Character> CHAR_IGNORE =
            character -> Character.isSpaceChar(character) || IGNORE_CHAR_LIST.contains(character);
    /**
     * Rebuild the dictionary
     */
    public static void refactoringBy(List<String> sensitiveWordList) {
        refactor(sensitiveWordList);
    }
    /**
     * Add sensitive words to the dictionary
     */
    public static void add(List<String> sensitiveWordList) {
        sensitiveWordList.forEach(word -> recordToThe(SensitiveWordCache.dictionary, word));
    }
    /**
     * Add a sensitive word to the dictionary
     */
    public static void add(String sensitiveWord) {
        recordToThe(SensitiveWordCache.dictionary, sensitiveWord);
    }
    /**
     * true: text contains at least one sensitive word
     */
    public static boolean foundIn(String text) {
        if (isEmpty(text)) {
            return false;
        }
        for (int i = 0; i < text.length(); i++) {
            if (checkSensitiveWord(text, i) > 0) {
                return true;
            }
        }
        return false;
    }
    /**
     * Find the sensitive words contained in text
     */
    public static Set<String> findOutFrom(String text) {
        if (isEmpty(text)) {
            return Collections.emptySet();
        }
        // longer matches first; equal lengths fall back to natural ordering
        Set<String> resultSet = new TreeSet<>((o1, o2) -> o1.length() == o2.length() ? o1.compareTo(o2) : o2.length() - o1.length());
        for (int i = 0; i < text.length(); i++) {
            int endIndex = checkSensitiveWord(text, i);
            if (endIndex > 0) {
                resultSet.add(text.substring(i, ++endIndex));
            }
        }
        return resultSet;
    }
    /**
     * Replace the sensitive words in text, one replacement character per masked character
     *
     * @param text        the text
     * @param replaceChar the replacement character
     * @return the text after replacement
     */
    public static String replace(String text, String replaceChar) {
        Set<String> sensitiveWordSet = findOutFrom(text);
        if (sensitiveWordSet.isEmpty()) {
            return text;
        }
        for (String sensitiveWord : sensitiveWordSet) {
            text = text.replace(sensitiveWord, replacementOf(replaceChar, sensitiveWord.length()));
        }
        return text;
    }
    /**
     * Dictionary cache
     */
    private static class SensitiveWordCache {
        /**
         * Dictionary / root node of the dictionary
         */
        static Node dictionary;

        static {
            dictionary = new Node();
            dictionary.children = new HashMap<>(16);
        }

        private SensitiveWordCache() {}
    }
    /**
     * Rebuild the dictionary
     *
     * @param sensitiveWordList list of sensitive words
     */
    private static void refactor(List<String> sensitiveWordList) {
        Node newDictionary = new Node();
        newDictionary.children = new HashMap<>(16);
        synchronized (SensitiveWordCache.class) {
            for (String word : sensitiveWordList) {
                recordToThe(newDictionary, word);
            }
            SensitiveWordCache.dictionary = newDictionary;
        }
    }
    /**
     * Record a sensitive word under the given node
     *
     * @param node the node
     * @param word the sensitive word
     */
    private static void recordToThe(Node node, String word) {
        Objects.requireNonNull(node);
        synchronized (SensitiveWordCache.class) {
            for (int i = 0, lastIndex = word.length() - 1; i < word.length(); i++) {
                Character key = word.charAt(i);
                if (!CHAR_IGNORE.test(key)) {
                    // attach the child node
                    Node next = node.get(key);
                    if (Objects.isNull(next)) {
                        next = new Node();
                        node.putChild(key, next);
                    }
                    node = next;
                }
                if (i == lastIndex) {
                    node.isEnd = true;
                }
            }
        }
    }
    /**
     * Match a sensitive word in text starting from startIndex
     *
     * @param text       the text
     * @param startIndex start position in the text
     * @return 0 - no sensitive word; >0 - the index at which the sensitive word ends
     */
    private static int checkSensitiveWord(String text, int startIndex) {
        int endIndex = 0;
        Node node = SensitiveWordCache.dictionary;
        for (int i = startIndex; i < text.length(); i++) {
            Character key = text.charAt(i);
            if (CHAR_IGNORE.test(key)) {
                continue;
            }
            node = node.get(key);
            if (Objects.isNull(node)) {
                break;
            }
            if (node.isEnd) {
                endIndex = i;
                if (SIMPLE_MATCH) {
                    break;
                }
            }
        }
        return endIndex;
    }
    private static boolean isEmpty(String str) {
        return str == null || "".equals(str);
    }
    /**
     * Build the complete replacement string
     *
     * @param replaceChar single-character replacement string
     * @param num         number of characters to replace
     * @return the complete replacement string
     */
    private static String replacementOf(String replaceChar, int num) {
        int minJointLength = 2;
        if (num < minJointLength) {
            return replaceChar;
        }
        StringBuilder replacement = new StringBuilder();
        for (int i = 0; i < num; i++) {
            replacement.append(replaceChar);
        }
        return replacement.toString();
    }
    /**
     * Dictionary (trie) node
     */
    private static class Node {
        /**
         * true: a sensitive word ends at this node
         */
        boolean isEnd;
        /**
         * Child nodes
         */
        Map<Character, Node> children;

        Node get(Character key) {
            return Objects.nonNull(children) ? children.get(key) : null;
        }

        void putChild(Character key, Node node) {
            if (Objects.isNull(children)) {
                children = new HashMap<>(16);
            }
            children.put(key, node);
        }
    }
    /**
     * Initialize the ignored-character list
     */
    private static List<Character> ignoreCharListInit() {
        List<Character> ignoreCharList = new ArrayList<>(10);
        ignoreCharList.add('|');
        ignoreCharList.add('-');
        return Collections.unmodifiableList(ignoreCharList);
    }

    private SensitiveWordFilters() {}
}
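In practice the prepared sensitive-word list usually comes from a file or a database rather than being hard-coded. The sketch below shows one possible way to wire that up at startup; the loader class, the sensitive-words.txt file name, and the one-word-per-line format are assumptions for illustration, not part of the filter itself.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

public class SensitiveWordLoader {
    /**
     * Reads one sensitive word per line (UTF-8) and rebuilds the dictionary.
     */
    public static void loadFrom(String path) throws IOException {
        List<String> words = Files.readAllLines(Paths.get(path), StandardCharsets.UTF_8);
        // refactoringBy swaps in a freshly built trie under a lock,
        // so it can also be called later to refresh the dictionary at runtime
        SensitiveWordFilters.refactoringBy(words);
    }

    public static void main(String[] args) throws IOException {
        loadFrom("sensitive-words.txt");
        System.out.println(SensitiveWordFilters.findOutFrom("待检测的文本"));
    }
}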
Filter Test Class
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
class SensitiveWordFiltersTest {
    /**
     * Rebuild the dictionary
     */
    @Test
    void refactoringBy() {
        SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
    }
    /**
     * Add sensitive words to the dictionary
     */
    @Test
    void add() {
        SensitiveWordFilters.add(Arrays.asList("敏感词"));
    }
    /**
     * Check whether the content contains sensitive words
     */
    @Test
    void foundIn() {
        SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
        Assertions.assertTrue(SensitiveWordFilters.foundIn("白银混蛋"));
    }
    /**
     * Find the sensitive words in the content
     */
    @Test
    void findOutFrom() {
        SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
        System.out.println(SensitiveWordFilters.findOutFrom("白银混蛋"));
    }
    /**
     * Replace the sensitive words in the content
     */
    @Test
    void replace() {
        SensitiveWordFilters.refactoringBy(Arrays.asList(getSensitiveWords()));
        String string = "就算是一个 顶 - 级 高 手,也会被那个白银 混蛋坑得很惨";
        System.out.println(SensitiveWordFilters.replace(string, "*"));
    }
    private static String[] getSensitiveWords() {
        return sensitiveWords.split("\\|");
    }

    static final String sensitiveWords = "顶级 | 白银 | 混蛋";
}