2025-12-06 08:49:44 舰船研究院

敏感词过滤

一、什么是敏感词过滤?

敏感词过滤是一种处理网络内容的技术,可以检测和过滤出网络中的敏感/违禁词汇。它通过给定的关键字或字符串,判断网络内容是否包含某些敏感信息,从而防止违反法律法规的信息流通。

通常,可以使用两种方法来过滤敏感词:

黑名单过滤:即定义一个黑名单,将所有敏感词记录在其中,然后将输入的文本与之进行对比,如果发现有敏感词,就将其过滤掉。

白名单过滤:即定义一个白名单,将所有不敏感的词汇记录在其中,然后对输入的文本进行对比,如果发现有不在白名单中的词汇,就将其过滤掉。

二、ToolGood.Words是什么?

ToolGood.Words是一款高性能非法词(敏感词)检测组件,附带繁体简体互换,支持全角半角互换,获取拼音首字母,获取拼音字母,拼音模糊搜索等功能。

ToolGood.Words的源码网站:https://gitcode.net/mirrors/toolgood/ToolGood.Words?utm_source=csdn_github_accelerator

三、在Visual Studio中安装ToolGood.Words

3.1、

一、什么是敏感词过滤?

敏感词过滤是一种处理网络内容的技术,可以检测和过滤出网络中的敏感/违禁词汇。它通过给定的关键字或字符串,判断网络内容是否包含某些敏感信息,从而防止违反法律法规的信息流通。

通常,可以使用两种方法来过滤敏感词:

黑名单过滤:即定义一个黑名单,将所有敏感词记录在其中,然后将输入的文本与之进行对比,如果发现有敏感词,就将其过滤掉。

白名单过滤:即定义一个白名单,将所有不敏感的词汇记录在其中,然后对输入的文本进行对比,如果发现有不在白名单中的词汇,就将其过滤掉。

二、ToolGood.Words是什么?

ToolGood.Words是一款高性能非法词(敏感词)检测组件,附带繁体简体互换,支持全角半角互换,获取拼音首字母,获取拼音字母,拼音模糊搜索等功能。

ToolGood.Words的源码网站:https://gitcode.net/mirrors/toolgood/ToolGood.Words?utm_source=csdn_github_accelerator

三、在Visual Studio中安装ToolGood.Words

3.1、打开 NuGet 包管理器,搜索"ToolGood.Words"并安装:

四、创建"subContentCheck"类

敏感/违禁词汇因特殊内容不便上传,可自行在网站上查找

using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Microsoft.AspNetCore.DataProtection.KeyManagement;
using Microsoft.AspNetCore.Http;
using Microsoft.CodeAnalysis.Text;
using Newtonsoft.Json;
using ToolGood.Words;
using static System.Net.Mime.MediaTypeNames;
using IHostingEnvironment = Microsoft.AspNetCore.Hosting.IHostingEnvironment;

namespace WebApplication1 //放在自己项目中时,需要更换为自己的命名空间

{

/// <summary>
/// Serializable container for a list of prohibited keywords.
/// </summary>
public class keywords
{
    /// <summary>
    /// Prohibited keywords. Starts out as an empty list so it can be filled
    /// without a prior null check.
    /// </summary>
    public List<string> IllegalKeywords { get; set; } = new List<string>();
}

/// <summary>
/// Serializable container for a list of prohibited links / sites / URLs.
/// </summary>
public class urlwords
{
    /// <summary>
    /// Prohibited URLs. Starts out as an empty list so it can be filled
    /// without a prior null check.
    /// </summary>
    public List<string> IllegalUrls { get; set; } = new List<string>();
}

/// <summary>

/// Checks submitted content for sensitive / prohibited words.

/// </summary>

public class subContentCheck

{

/// <summary>

/// Hosting environment passed to the constructor; its WebRootPath is the base path for the dictionary files.

/// </summary>

private IHostingEnvironment _hostingEnv;

/// <summary>

/// Location of the sensitive-word file; appended to the web root path when loading.

/// </summary>

private string dictionaryPath = "/sensitiveWords/sensitiveWords.txt";

/// <summary>

/// Location of the file listing sensitive links / sites / URLs; appended to the web root path when loading.

/// </summary>

private string urlsPath = "/sensitiveWords/IllegalUrls.txt";

/// <summary>

/// Loaded sensitive terms (words followed by URLs); rebuilt by InitDictionary.

/// </summary>

public string[] Words { get; set; }

/// <summary>
/// Creates the checker and immediately loads the term lists from disk.
/// </summary>
/// <param name="hostingEnv">Hosting environment whose WebRootPath locates the dictionary files.</param>
/// <exception cref="ArgumentNullException"><paramref name="hostingEnv"/> is null.</exception>
public subContentCheck(IHostingEnvironment hostingEnv)
{
    // Fail at the boundary with a descriptive exception instead of a
    // NullReferenceException later inside InitDictionary.
    _hostingEnv = hostingEnv ?? throw new ArgumentNullException(nameof(hostingEnv));
    InitDictionary();
}

/// <summary>
/// (Re)loads the in-memory term list from the sensitive-word file and the sensitive-URL file.
/// </summary>
/// <remarks>
/// Both files are read as UTF-8 from below the web root. A line may hold several terms
/// separated by '|'; empty entries are skipped. Terms from the word file come first,
/// terms from the URL file follow. Exceptions raised while reading either file propagate
/// to the caller, and <see cref="Words"/> is left as an empty array in that case.
/// </remarks>
public void InitDictionary()
{
    // Reset first so that a failed read leaves an empty list rather than stale data.
    Words = new string[] { };

    string wordsPath = _hostingEnv.WebRootPath + dictionaryPath;
    string urlPath = _hostingEnv.WebRootPath + urlsPath;

    // Read both files before touching Words again, so a failure on the second
    // file cannot leave a half-built list behind.
    string[] wordLines = System.IO.File.ReadAllLines(wordsPath, System.Text.Encoding.UTF8);
    string[] urlLines = System.IO.File.ReadAllLines(urlPath, System.Text.Encoding.UTF8);

    // One pipeline handles zero, one or many lines alike; RemoveEmptyEntries
    // drops the empty strings produced by leading, trailing or doubled '|'.
    char[] separators = { '|' };
    Words = wordLines
        .Concat(urlLines)
        .SelectMany(line => line.Split(separators, StringSplitOptions.RemoveEmptyEntries))
        .ToArray();
}

/// <summary>
/// Replaces the sensitive terms found in <paramref name="sourceText"/> using a mask character.
/// </summary>
/// <param name="sourceText">Text to filter.</param>
/// <param name="replaceChar">Character used for masking; defaults to '*'.</param>
/// <returns>
/// "2" when <paramref name="sourceText"/> is null or empty; "0" when loading the keywords
/// into the searcher throws; "1" when the replacement throws; otherwise the filtered text.
/// </returns>
/// <remarks>
/// NOTE(review): status codes share the return channel with the filtered text, so a result
/// that is literally "0", "1" or "2" cannot be told apart from a status code by the caller.
/// </remarks>
public string FilterWithChar(string sourceText, char replaceChar = '*')
{
    if (string.IsNullOrEmpty(sourceText))
    {
        return "2";
    }

    var searcher = new WordsSearch();

    try
    {
        searcher.SetKeywords(Words);
    }
    catch (Exception)
    {
        // Keyword set could not be loaded into the searcher.
        return "0";
    }

    try
    {
        return searcher.Replace(sourceText, replaceChar);
    }
    catch (Exception)
    {
        // Replacement itself failed.
        return "1";
    }
}

/// <summary>
/// Checks whether <paramref name="sourceText"/> contains any sensitive / prohibited term.
/// </summary>
/// <param name="sourceText">Text to inspect.</param>
/// <returns>
/// "2" when <paramref name="sourceText"/> is null or empty; "0" when loading the keywords
/// into the searcher throws; "1" when the search throws; "3" when a term was found;
/// "4" when no term was found.
/// </returns>
public string FindSensitiveKey(string sourceText)
{
    if (string.IsNullOrEmpty(sourceText))
    {
        return "2";
    }

    var searcher = new WordsSearch();

    try
    {
        searcher.SetKeywords(Words);
    }
    catch (Exception)
    {
        // Keyword set could not be loaded into the searcher.
        return "0";
    }

    try
    {
        return searcher.ContainsAny(sourceText) ? "3" : "4";
    }
    catch (Exception)
    {
        // Search itself failed.
        return "1";
    }
}

/// <summary>
/// Serializes a list to JSON, echoes it to the console and writes it to
/// "&lt;current directory&gt;/&lt;filename&gt;.json".
/// </summary>
/// <param name="jsonData">Keyword containers to serialize; takes precedence when not null.</param>
/// <param name="urlJsonData">URL containers to serialize; used only when <paramref name="jsonData"/> is null.</param>
/// <param name="filename">Target file name without the ".json" extension.</param>
/// <remarks>
/// The file is written with the GB2312 code page, as before.
/// NOTE(review): JSON consumers usually expect UTF-8 — confirm that readers of this file expect GB2312.
/// </remarks>
public static void Write(List<keywords> jsonData, List<urlwords> urlJsonData, string filename)
{
    // Path.Combine picks the platform's separator; a hard-coded "\\" only works on Windows.
    string filePath = System.IO.Path.Combine(Directory.GetCurrentDirectory(), filename + ".json");

    string json = jsonData != null
        ? JsonConvert.SerializeObject(jsonData)
        : JsonConvert.SerializeObject(urlJsonData);

    Console.WriteLine(json);

    // GB2312 is not available until the code-pages provider has been registered.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    // FileMode.Create truncates an existing file. OpenOrCreate kept the old bytes, so a
    // shorter payload left trailing garbage behind and produced invalid JSON.
    using (FileStream fs = new FileStream(filePath, FileMode.Create, System.IO.FileAccess.ReadWrite, FileShare.ReadWrite))
    using (StreamWriter sw = new StreamWriter(fs, Encoding.GetEncoding("GB2312")))
    {
        sw.WriteLine(json);
    }
}

}

}

五、写API接口

/// <summary>
/// Masks the sensitive words contained in the submitted text.
/// </summary>
/// <param name="sourctText">Text to be masked.</param>
/// <returns>
/// JSON object with <c>code</c>, <c>msg</c> and <c>resultStr</c>. Codes: 200 success
/// (<c>resultStr</c> holds the masked text), 210 keyword setup failed, 220 unexpected error,
/// 230 / 260 empty input, 240 replacement failed.
/// </returns>
/// <remarks>
/// NOTE(review): FilterWithChar reports errors as the strings "0", "1" and "2", so a masked
/// text that is exactly one of these strings is reported as an error here.
/// </remarks>
[HttpPost]
public IActionResult sensitive_words_replace2(string sourctText)
{
    // Reject empty input before doing any file I/O.
    if (string.IsNullOrEmpty(sourctText))
    {
        return Json(new { code = 230, msg = "需要替换的文本内容为空!", resultStr = "" });
    }

    try
    {
        // Created inside the try block: the constructor reads the dictionary files, and a
        // missing or unreadable file should yield the code-220 response, not an unhandled error.
        subContentCheck strCheck = new subContentCheck(_hostingEnv);

        string resultStr = strCheck.FilterWithChar(sourctText);
        int resCode;
        string resMsg;

        switch (resultStr)
        {
            case "0":
                resCode = 210;
                resultStr = "";
                resMsg = "设置违禁词时发生错误,请联系管理员!";
                break;
            case "1":
                resCode = 240;
                resultStr = "";
                resMsg = "敏感内容替换时发生错误!";
                break;
            case "2":
                resCode = 260;
                resultStr = "";
                resMsg = "需要替换的文本内容为空!";
                break;
            default:
                resCode = 200;
                resMsg = "敏感词替换请求成功!";
                break;
        }

        return Json(new { code = resCode, msg = resMsg, resultStr = resultStr });
    }
    catch (Exception)
    {
        return Json(new { code = 220, msg = "敏感内容替换时发生错误!", resultStr = "" });
    }
}

/// <summary>
/// Reports whether the submitted text contains sensitive / prohibited words.
/// </summary>
/// <param name="sourctText">Text to inspect.</param>
/// <returns>
/// JSON object with <c>code</c>, <c>msg</c> and <c>resultStr</c>. Codes: 200 no sensitive word
/// found, 270 sensitive word found, 210 keyword setup failed, 220 unexpected error,
/// 230 / 260 empty input, 240 matching failed.
/// </returns>
[HttpPost]
public IActionResult whether_sensitive_words(string sourctText)
{
    // Reject empty input before doing any file I/O. The message matches the
    // code-260 branch below: this endpoint checks text, it does not replace it.
    if (string.IsNullOrEmpty(sourctText))
    {
        return Json(new { code = 230, msg = "需要判断的文本内容为空!", resultStr = "" });
    }

    try
    {
        // Created inside the try block: the constructor reads the dictionary files, and a
        // missing or unreadable file should yield the code-220 response, not an unhandled error.
        subContentCheck strCheck = new subContentCheck(_hostingEnv);

        string resultStr = strCheck.FindSensitiveKey(sourctText);
        int resCode;
        string resMsg;

        switch (resultStr)
        {
            case "0":
                resCode = 210;
                resultStr = "";
                resMsg = "设置违禁词时发生错误,请联系管理员!";
                break;
            case "1":
                resCode = 240;
                resultStr = "";
                resMsg = "敏感内容匹配时发生错误!";
                break;
            case "2":
                resCode = 260;
                resultStr = "";
                resMsg = "需要判断的文本内容为空!";
                break;
            case "3":
                resCode = 270;
                resultStr = "";
                resMsg = "内容中含有敏感/违禁词!";
                break;
            default:
                // Any other value keeps resultStr as returned by FindSensitiveKey.
                resCode = 200;
                resMsg = "内容中不含敏感/违禁词!";
                break;
        }

        return Json(new { code = resCode, msg = resMsg, resultStr = resultStr });
    }
    catch (Exception)
    {
        return Json(new { code = 220, msg = "敏感内容匹配时发生错误!", resultStr = "" });
    }
}

六、前端封装JS方法

/**
 * Masks sensitive / prohibited words in a text through the backend endpoint
 * and writes the masked text into a form element.
 *
 * @param {string} sourctText Text to be masked.
 * @param {string} boxid id attribute of the element whose value receives the masked text.
 * @param {object} layui Layui instance; its layer module is used to show alerts.
 * @returns {string} resultStr from the response, or "" when the request fails.
 */
function sensitive_words_replace(sourctText, boxid, layui) {
    let resultStr = "";
    // Status codes the endpoint uses to report a failure.
    const failureCodes = [210, 220, 230, 240, 260];

    $.ajax({
        url: "/Home/sensitive_words_replace2", // backend endpoint
        dataType: "JSON",
        type: "POST",
        // Synchronous on purpose: the function returns resultStr, which would still be ""
        // if the request were asynchronous (same approach as whether_sensitive_words).
        async: false,
        data: {
            "sourctText": sourctText
        },
        success: function (res) {
            // Number() accepts the code as a number or as a numeric string.
            const resCode = Number(res.code);

            if (failureCodes.indexOf(resCode) !== -1) {
                // Close open layers (e.g. a loading indicator) before showing the alert.
                layer.closeAll();
                resultStr = res.resultStr;
                layui.layer.alert(res.msg, { icon: 5, title: "温馨提示", closeBtn: 0 });
            } else if (resCode === 200) {
                resultStr = res.resultStr;
                $("#" + boxid).val(resultStr);
                // Close open layers (e.g. a loading indicator) once the value is set.
                layer.closeAll();
            }
        },
        error: function (jqXHR, textStatus, errorThrown) {
            layer.closeAll();
            // The first callback argument is the jqXHR object; show a readable status text instead.
            layui.layer.alert(errorThrown || textStatus, { icon: 5, title: "温馨提示", closeBtn: 0 });
        }
    });

    return resultStr;
}

/**
 * Asks the backend whether a text contains sensitive / prohibited words.
 *
 * @param {string} sourctText Text to inspect.
 * @param {string} boxid Not used by this function; kept for signature compatibility.
 * @param {object} layui Layui instance; its layer module is used to show alerts.
 * @returns {boolean} true when the backend answers with code 270 (sensitive word found);
 *   false for every other outcome, including failed requests.
 */
function whether_sensitive_words(sourctText, boxid, layui) {
    let resultBool = false;
    // Status codes the endpoint uses to report a failure.
    const failureCodes = [210, 220, 230, 240, 260];

    $.ajax({
        url: "/Home/whether_sensitive_words", // backend endpoint
        dataType: "JSON",
        type: "POST",
        // Synchronous on purpose: the result must be available before this function returns.
        async: false,
        data: {
            "sourctText": sourctText
        },
        success: function (res) {
            // Number() accepts the code as a number or as a numeric string.
            const resCode = Number(res.code);

            if (failureCodes.indexOf(resCode) !== -1) {
                resultBool = false;
                layui.layer.alert(res.msg, { icon: 5, title: "温馨提示", closeBtn: 0 });
            } else if (resCode === 270) {
                resultBool = true;
            } else if (resCode === 200) {
                resultBool = false;
                // Close open layers (e.g. a loading indicator) once the answer is known.
                layer.closeAll();
            }
        },
        error: function (jqXHR, textStatus, errorThrown) {
            // The first callback argument is the jqXHR object; show a readable status text instead.
            layui.layer.alert(errorThrown || textStatus, { icon: 5, title: "温馨提示", closeBtn: 0 });
        }
    });

    return resultBool;
}