关于java:java敏感词过滤

106次阅读

共计 12631 个字符,预计需要花费 32 分钟才能阅读完成。

Java 敏感词过滤

敏感词:“美元”,“中国”,“北京大学”,“北大”,“南京大学”

DFAUtils

`import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;

/**
 * Registry and matcher for sensitive words, stored in a character trie.
 *
 * <p>All state is static and shared by every caller. Every public method runs under the same
 * lock, so a reader never traverses the trie while a writer is modifying it.
 */
public class DFAUtils {
    /** First UTF-16 code unit that the single-character rule treats as Chinese (13312). */
    private static final char CHINESE_FIRST = 0x3400;

    /** Last UTF-16 code unit that the single-character rule treats as Chinese (40895). */
    private static final char CHINESE_LAST = 0x9FBF;

    /** Guards {@link #root} and every node reachable from it. */
    private static final Object lock = new Object();

    /** Root of the trie; it represents the empty prefix and is never a word end itself. */
    private static Node root = new Node();

    /** One trie node: the outgoing edges plus a flag telling whether a word ends here. */
    private static final class Node {
        private final Map<Character, Node> children = new HashMap<Character, Node>();
        private boolean terminal;
    }

    /**
     * Registers a sensitive word. Adding a word that is already registered has no effect.
     *
     * @param sensitiveWord the word to register; {@code null} and the empty string are ignored
     */
    public static void addSensitiveWord(String sensitiveWord) {
        if (sensitiveWord == null || sensitiveWord.isEmpty()) {
            return;
        }
        synchronized (lock) {
            Node node = root;
            for (int i = 0; i < sensitiveWord.length(); i++) {
                Character key = sensitiveWord.charAt(i);
                Node next = node.children.get(key);
                if (next == null) {
                    next = new Node();
                    node.children.put(key, next);
                }
                node = next;
            }
            node.terminal = true;
        }
    }

    /**
     * Scans {@code content} from left to right and returns the first sensitive word found.
     *
     * <p>For each start position the shortest registered word beginning there wins. A registered
     * single-character word only counts when neither neighbouring character is a Chinese
     * character (the start and end of the text count as non-Chinese); otherwise the scan goes on
     * looking for a longer word from the same start position.
     *
     * @param content the text to scan
     * @return the matched word, or {@code null} when {@code content} is {@code null} or empty,
     *         no word is registered, or nothing matches
     */
    public static String checkSensitiveWord(String content) {
        if (content == null || content.isEmpty()) {
            return null;
        }
        char[] chars = content.toCharArray();
        synchronized (lock) {
            if (root.children.isEmpty()) {
                return null;
            }
            for (int start = 0; start < chars.length; start++) {
                Node node = root.children.get(chars[start]);
                if (node == null) {
                    continue;
                }
                if (node.terminal && isIsolated(chars, start)) {
                    return String.valueOf(chars[start]);
                }
                for (int end = start + 1; end < chars.length; end++) {
                    node = node.children.get(chars[end]);
                    if (node == null) {
                        break;
                    }
                    if (node.terminal) {
                        return new String(chars, start, end - start + 1);
                    }
                }
            }
            return null;
        }
    }

    /**
     * Removes a registered word. Longer words that share its prefix are kept, and trie nodes
     * that no longer lead to any word are pruned so that an empty registry is really empty.
     *
     * @param sensitiveWord the word to remove; {@code null}, the empty string and unregistered
     *                      words are ignored
     */
    public static void delSensitiveWord(String sensitiveWord) {
        if (sensitiveWord == null || sensitiveWord.isEmpty()) {
            return;
        }
        int length = sensitiveWord.length();
        synchronized (lock) {
            // path[i] is the node reached after consuming the first i characters.
            Node[] path = new Node[length + 1];
            path[0] = root;
            for (int i = 0; i < length; i++) {
                Node next = path[i].children.get(sensitiveWord.charAt(i));
                if (next == null) {
                    return;
                }
                path[i + 1] = next;
            }
            if (!path[length].terminal) {
                return;
            }
            path[length].terminal = false;
            // Walk back towards the root, dropping nodes that neither end a word nor have children.
            for (int i = length; i > 0; i--) {
                Node node = path[i];
                if (node.terminal || !node.children.isEmpty()) {
                    break;
                }
                path[i - 1].children.remove(sensitiveWord.charAt(i - 1));
            }
        }
    }

    /**
     * Lists every registered word. The misspelled method name is kept for existing callers.
     *
     * @return a new list of all registered words, empty when none is registered; a word is
     *         listed before any longer word that extends it, otherwise the order is unspecified
     */
    public static LinkedList<String> getSevsitiveWords() {
        LinkedList<String> listWords = new LinkedList<String>();
        synchronized (lock) {
            collectWords(root, new StringBuilder(), listWords);
        }
        return listWords;
    }

    /**
     * Tells whether exactly this word is registered.
     *
     * <p>This is an exact trie lookup, so a word is found even when a shorter registered word is
     * a prefix of it.
     *
     * @param sensitiveWord the word to look up
     * @return {@code true} if the word is registered; {@code false} otherwise, including for
     *         {@code null} and the empty string
     */
    public static boolean containSensitiveWord(String sensitiveWord) {
        if (sensitiveWord == null || sensitiveWord.isEmpty()) {
            return false;
        }
        synchronized (lock) {
            Node node = root;
            for (int i = 0; i < sensitiveWord.length() && node != null; i++) {
                node = node.children.get(sensitiveWord.charAt(i));
            }
            return node != null && node.terminal;
        }
    }

    /**
     * Removes every registered word.
     */
    public static void clearSensitiveWord() {
        synchronized (lock) {
            root = new Node();
        }
    }

    /** Depth-first walk that appends every word below {@code node} to {@code listWords}. */
    private static void collectWords(Node node, StringBuilder prefix, LinkedList<String> listWords) {
        if (node.terminal) {
            listWords.add(prefix.toString());
        }
        for (Map.Entry<Character, Node> edge : node.children.entrySet()) {
            prefix.append(edge.getKey().charValue());
            collectWords(edge.getValue(), prefix, listWords);
            prefix.setLength(prefix.length() - 1);
        }
    }

    /** Returns whether neither neighbour of {@code chars[index]} is a Chinese character. */
    private static boolean isIsolated(char[] chars, int index) {
        boolean before = index == 0 || !isChinese(chars[index - 1]);
        boolean after = index + 1 >= chars.length || !isChinese(chars[index + 1]);
        return before && after;
    }

    /** Returns whether {@code c} lies in the code-unit range the filter treats as Chinese. */
    private static boolean isChinese(char c) {
        return c >= CHINESE_FIRST && c <= CHINESE_LAST;
    }
}


DFAUtilsTest

外汇名词解释 https://www.fx61.com/definitions

import org.junit.Assert;
import org.junit.Test;

import java.util.LinkedList;

/**
 * JUnit tests for {@code DFAUtils}.
 *
 * <p>Every test starts with {@code DFAUtils.clearSensitiveWord()} because the registry is static
 * and therefore shared between tests.
 *
 * <p>NOTE(review): several expected values below do not match the words the test itself
 * registers (see the inline notes). The word literals may have been altered after the
 * assertions were written; confirm against the original test data before relying on them.
 */
public class DFAUtilsTest {
    /*==========================AddSensitiveWord-start==========================*/
    /**
     * Adds words one by one, including duplicates and words sharing a prefix, and checks the
     * number of registered words after each step.
     */
    @Test
    public void testAddSensitiveWord01() {DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.addSensitiveWord("中哈");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        // Re-adding words that are already registered must not change the count.
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.addSensitiveWord("中哈");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        // Deleting the one-character word leaves the longer word that shares its prefix.
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.addSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.addSensitiveWord("中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(3, listWords.size());
        DFAUtils.addSensitiveWord("人");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(4, listWords.size());
        // The one-character word was deleted above, so adding it again counts as a new word.
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(5, listWords.size());
    }

    /*==========================AddSensitiveWord-end============================*/

    /*==========================CheckSensitiveWord-start==========================*/
    /**
     * Registers a few words and checks what {@code checkSensitiveWord} returns for several texts.
     */
    @Test
    public void testCheckSensitiveWord01() {DFAUtils.clearSensitiveWord();
        String sencitivaWord = null;
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("大");
        DFAUtils.addSensitiveWord("大学");
        DFAUtils.addSensitiveWord("中中中国中中中");
        listWords = DFAUtils.getSevsitiveWords();
        // NOTE(review): only three words are registered above, yet eight are expected; confirm.
        Assert.assertEquals(8, listWords.size());
        // NOTE(review): "滚" is never registered in this test, yet matches are expected below; confirm.
        sencitivaWord = DFAUtils.checkSensitiveWord("滚");
        Assert.assertEquals("滚", sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("翻滚");
        Assert.assertEquals(null, sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("滚");
        Assert.assertEquals("滚", sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("滚");
        Assert.assertEquals("滚", sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("体操");
        Assert.assertEquals(null, sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("你好滚滚");
        Assert.assertEquals("滚滚", sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("滚你好滚");
        Assert.assertEquals(null, sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("滚轮胎");
        Assert.assertEquals(null, sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("你你国国");
        Assert.assertEquals(null, sencitivaWord);

        // The text starts with a partial occurrence; only the complete word after the space matches.
        sencitivaWord = DFAUtils.checkSensitiveWord("中中国中中 中中中中国中中中");
        Assert.assertEquals("中中中国中中中", sencitivaWord);
    }

    /*==========================CheckSensitiveWord-end============================*/

    /*==========================DelSensitiveWord-start==========================*/
    /**
     * Deletes from a registry holding a single one-character word: the empty string and an
     * unregistered word change nothing, the registered word empties the registry.
     */
    @Test
    public void testDelSensitiveWord01() {DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("国");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(0, listWords.size());
    }

    /**
     * Deletes from a registry holding a single two-character word: a proper prefix and a longer
     * extension of the word change nothing, the exact word empties the registry.
     */
    @Test
    public void testDelSensitiveWord02() {DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(0, listWords.size());
    }

    /**
     * Deletes from a registry holding a word and its one-character prefix, then re-adds the
     * prefix and deletes again.
     */
    @Test
    public void testDelSensitiveWord03() {DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("中中");
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.delSensitiveWord("");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.delSensitiveWord("中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        // NOTE(review): the next two deletions target a word registered above, yet the count is
        // expected to stay at 2 and the same word is deleted a third time further down; confirm.
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(0, listWords.size());
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(0, listWords.size());
    }

    /**
     * Deletes from a registry holding three words that share a common prefix, re-adding some of
     * them between deletions.
     */
    @Test
    public void testDelSensitiveWord04() {DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("中中中 111");
        DFAUtils.addSensitiveWord("中中");
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(3, listWords.size());
        DFAUtils.delSensitiveWord("");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(3, listWords.size());
        DFAUtils.delSensitiveWord("中中中 111");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        // Deleting a prefix that is not itself a registered word must not remove anything.
        DFAUtils.addSensitiveWord("中中中 111");
        DFAUtils.delSensitiveWord("中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(3, listWords.size());
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.addSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(3, listWords.size());
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        // NOTE(review): this deletes a word re-added above, yet the count is expected to stay at 2; confirm.
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
    }
    /*==========================DelSensitiveWord-end============================*/

    /*==========================ContainSensitiveWord-start==========================*/
    /**
     * Checks {@code containSensitiveWord} for a blank string and for a single character.
     */
    @Test
    public void testContainSensitiveWord01() {DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("滚");
        DFAUtils.addSensitiveWord("中中中国中中中");
        listWords = DFAUtils.getSevsitiveWords();
        // NOTE(review): only two words are registered above, yet seven are expected; confirm.
        Assert.assertEquals(7, listWords.size());
        Assert.assertEquals(false, DFAUtils.containSensitiveWord(" "));
        // NOTE(review): "操" is never registered in this test, yet it is expected to be contained; confirm.
        Assert.assertEquals(true, DFAUtils.containSensitiveWord("操"));
    }

    /*==========================ContainSensitiveWord-end============================*/
}

正文完
 0