import time
import re
class Segment:
    """Dictionary-based word segmenter using maximum matching.

    Two greedy strategies are offered: forward maximum matching (``MM``),
    which scans the sentence left to right, and reverse maximum matching
    (``RMM``), which scans right to left. Both try the longest window first
    and shrink it until the window is a dictionary word or a single
    character.

    Result formats:
        * ``result_MM``  -- tokens in sentence order, each followed by
          ``"/ "`` (e.g. ``"AB/ C/ "``).
        * ``result_RMM`` -- tokens in *reverse* sentence order, each
          preceded by ``"/"`` (e.g. ``"/C/AB"``).
    """

    # Data members (class-level defaults; __init__ overwrites them).
    sentence = ""
    MaxLen = 0
    pos = 0
    len = 0
    result_MM = ""   # holds the MM segmentation result
    result_RMM = ""  # holds the RMM segmentation result
    final_res = ""
    # Never mutated in place: readDict rebinds a fresh list on the instance,
    # so instances do not share (and keep growing) one dictionary.
    dict = []

    # Constructor
    def __init__(self, sentence, MaxLen, dict_path="chineseDic.txt"):
        """Store the sentence and window size, then load the dictionary.

        Args:
            sentence: Text to segment.
            MaxLen: Maximum window length (in characters) tried per match.
            dict_path: Dictionary file passed to ``readDict``.
        """
        self.sentence = sentence
        self.MaxLen = MaxLen
        self.pos = 0
        self.len = self.MaxLen
        self.result_MM = ""
        self.result_RMM = ""
        self.final_res = ""
        self.dict = []
        self.readDict(dict_path)

    # Read the dictionary
    def readDict(self, path="chineseDic.txt"):
        """Load the word list from a UTF-8 text file into ``self.dict``.

        Each line contributes the text before its first comma. Surrounding
        whitespace (including the line terminator) is removed and blank
        entries are skipped. Any previously loaded words are replaced.
        """
        words = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                word = line.split(",")[0].strip()
                if word:
                    words.append(word)
        self.dict = words

    # Forward maximum matching
    def MM(self, nLen, nPos):
        """Segment ``self.sentence`` left to right starting at ``nPos``.

        Args:
            nLen: Window length for the first match attempt; subsequent
                tokens start again from ``self.MaxLen``.
            nPos: Index of the first character to segment.

        The tokens found are appended to ``self.result_MM``.
        """
        sentence = self.sentence
        length = len(sentence)
        vocab = set(self.dict)  # O(1) membership instead of a list scan
        pieces = []
        nPos = max(nPos, 0)
        # Iterative on purpose: one recursive call per window attempt hits
        # Python's recursion limit on sentences of a few hundred characters.
        while nPos < length:
            window = max(nLen, 1)
            substr = sentence[nPos:nPos + window]
            if substr in vocab or len(substr) == 1:
                # Dictionary hit, or a lone character emitted as-is.
                pieces.append(substr + "/ ")
                nPos += len(substr)
                nLen = self.MaxLen
            else:
                # Shrink from the actual slice length (the slice may have
                # been truncated by the end of the sentence).
                nLen = len(substr) - 1
        self.result_MM += "".join(pieces)

    # Reverse maximum matching
    def RMM(self, nLen, nPos):
        """Segment ``self.sentence`` right to left ending at ``nPos``.

        Args:
            nLen: Window length for the first match attempt; subsequent
                tokens start again from ``self.MaxLen``.
            nPos: Exclusive end index of the text to segment (normally
                ``len(sentence)``).

        The tokens found are appended to ``self.result_RMM`` in the order
        they are discovered, i.e. last word of the sentence first.
        """
        sentence = self.sentence
        vocab = set(self.dict)
        pieces = []
        nPos = min(nPos, len(sentence))
        while nPos > 0:
            window = max(nLen, 1)
            # Clamp at 0: a negative slice start would wrap around to the
            # end of the string instead of stopping at the beginning.
            start = max(nPos - window, 0)
            substr = sentence[start:nPos]
            if substr in vocab or len(substr) == 1:
                pieces.append("/" + substr)
                nPos = start
                nLen = self.MaxLen
            else:
                nLen = len(substr) - 1
        self.result_RMM += "".join(pieces)

    def getMMResult(self):
        """Return the accumulated forward-matching result string."""
        return self.result_MM

    def getRMMResult(self):
        """Return the accumulated reverse-matching result string."""
        return self.result_RMM

    def getFinalResult(self):
        """Return the final result chosen by ``printFinalResult``."""
        return self.final_res

    def printFinalResult(self):
        """Print both segmentations, compare them, and set ``final_res``.

        The forward (MM) result is always taken as the final result; the
        comparison only reports whether RMM produced the same token list.
        """
        print("正向最大匹配后果:")
        seg_res_MM = self.result_MM.replace(" ", "")
        print(seg_res_MM)
        seg_list_MM = seg_res_MM.split('/')
        # Every token is followed by '/', so the split yields one trailing ''.
        del seg_list_MM[-1]
        print(seg_list_MM)

        print("逆向最大匹配后果:")
        seg_res_RMM = self.result_RMM.replace(" ", "")
        print(seg_res_RMM)
        seg_list_RMM = seg_res_RMM.split('/')
        # Every token is preceded by '/', so the split yields one leading ''.
        del seg_list_RMM[0]
        seg_list_RMM.reverse()  # back to sentence order
        print(seg_list_RMM)

        # Compare the full lists: a prefix-only comparison would call two
        # segmentations of different length "consistent".
        if seg_list_MM != seg_list_RMM:
            print("两次分词后果不统一。")
        else:
            print("两次分词后果统一。")
        print("最终的分词后果为:")
        self.final_res = self.result_MM
        print(self.final_res)
def to_region(segmentation):
    """Convert a segmented string into 1-based inclusive character spans.

    Args:
        segmentation: Whitespace-separated tokens (spaces, tabs, newlines).
            The arithmetic assumes every token is a word plus exactly one
            trailing delimiter character, e.g. ``"AB/"``.

    Returns:
        A list of ``(start, end)`` tuples, one per token, where consecutive
        spans are contiguous. An empty or whitespace-only input gives ``[]``.
    """
    region = []
    start = 1
    # str.split() with no argument splits on runs of whitespace and, unlike
    # re.split on an empty string, returns no tokens for blank input.
    for word in segmentation.split():
        end = start + len(word) - 2  # drop the delimiter, make end inclusive
        region.append((start, end))
        start = end + 1
    return region
def PRF(target, pred):
    """Print and return precision, recall and F1 of ``pred`` against ``target``.

    Both arguments are iterables of hashable items; duplicates are ignored
    because the items are compared as sets.

    Returns:
        ``(p, r, f)`` as floats. A metric whose denominator would be zero
        (empty prediction, empty target, or no overlap) is reported as 0.0
        rather than raising ``ZeroDivisionError``.
    """
    t_set, p_set = set(target), set(pred)
    cap_num = len(t_set & p_set)
    p = cap_num / len(p_set) if p_set else 0.0
    r = cap_num / len(t_set) if t_set else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    print("P =", p)
    print("R =", r)
    print("F1 =", f)
    return p, r, f
if __name__ == '__main__':
    # Demo: segment a sample sentence with both strategies, time it, and
    # score the result against a hand-segmented reference.
    # The sample text must contain exactly the characters covered by
    # target_str below, otherwise every region after the mismatch shifts.
    test_str = '在这一年中,中国的改革开放和现代化建设持续向前迈进。国民经济放弃了“高增长、低通胀”的良好倒退态势。农业生产再次取得好的收成,企业改革持续深入,人民生存进一步改善。对外经济技术单干与交换不断扩大。'
    seg = Segment(test_str, 3)
    time_start = time.time()
    seg.MM(3, 0)
    seg.RMM(3, len(test_str))
    time_end = time.time()
    seg.printFinalResult()
    print('分词工夫:', time_end - time_start, 's')
    target_str = "在/ 这/ 一/ 年/ 中/ ,/ 中国/ 的/ 改革/ 凋谢/ 和/ 现代化/ 建设/ 持续/ 向前/ 迈进/ 。/ 国民经济/ 放弃/ 了/ “/ 高/ 增长/ 、/ 低/ 通胀/ ”/ 的/ 良好/ 倒退/ 态势/ 。/ 农业/ 生产/ 再次/ 取得/ 好/ 的/ 收成/ ,/ 企业/ 改革/ 持续/ 深入/ ,/ 人民/ 生存/ 进一步/ 改善/ 。/ 对外/ 经济/ 技术/ 单干/ 与/ 交换/ 一直/ 扩充/ 。/"
    re_pred = to_region(seg.getFinalResult())
    re_target = to_region(target_str)
    # Each word is recorded as the interval [i, j] of its start and end
    # positions in the text.
    print("分词后果:", re_pred)
    print("标准答案:", re_target)
    PRF(re_target, re_pred)