import re
import string

import jieba
import jieba.posseg as psg


# Load the stopword list.
def get_stopword_list():
    # Path to the stopword file: one word per line, loaded line by line.
    # Read with an explicit encoding so lookups match reliably.
    stop_word_path = '/home/work/semantic_platform_DAS/bottom_function/data/HGDstopwords.txt'
    with open(stop_word_path, encoding='UTF-8') as f:
        stopword_list = [sw.replace('\n', '') for sw in f.readlines()]
    return stopword_list


# Segment a sentence, optionally with part-of-speech (POS) tagging.
def seg_to_list(sentence, pos=False):
    if not pos:
        # Segmentation without POS tagging
        seg_list = jieba.cut(sentence)
    else:
        # Segmentation with POS tagging
        seg_list = psg.cut(sentence)
    return seg_list


# Remove noise words (stopwords, non-nouns, very short tokens).
def word_filter(seg_list, pos=False):
    stopword_list = get_stopword_list()
    filter_list = []
    # Decide whether to filter by POS tag based on the pos parameter.
    # If POS filtering is off, every token is marked 'n' so nothing is dropped by the tag check.
    for seg in seg_list:
        if not pos:
            word = seg
            flag = 'n'
        else:
            word = seg.word
            flag = seg.flag
        if not flag.startswith('n'):
            continue
        # Drop stopwords and words shorter than 2 characters.
        if word not in stopword_list and len(word) > 1:
            filter_list.append(word)
    return filter_list


# Segment, filter, and rejoin a single text into a cleaned string.
def remove_special_characters(text, pos=False):
    tokens = seg_to_list(text, pos)
    filtered_tokens = word_filter(tokens, pos)
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text


# Normalize every text in a corpus.
def normalize_corpus(corpus, pos=False):
    normalized_corpus = []
    for text in corpus:
        # Optional spelling correction with pycorrector (left disabled here):
        # corrected_sent, detail = pycorrector.correct(text)
        # print(detail)
        # text1 = remove_special_characters(corrected_sent, pos)
        text1 = remove_special_characters(text, pos)
        normalized_corpus.append(text1)
    return normalized_corpus
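

# Usage sketch (illustrative only): the sample sentences below are made-up
# placeholders, and running this assumes HGDstopwords.txt exists at the path
# configured in get_stopword_list().
if __name__ == '__main__':
    sample_corpus = [
        '自然语言处理是人工智能领域的一个重要方向',
        '今天的天气非常好，适合出去散步',
    ]
    # With pos=True only noun tokens survive word_filter; with pos=False all
    # non-stopword tokens of length >= 2 are kept.
    print(normalize_corpus(sample_corpus, pos=True))
    print(normalize_corpus(sample_corpus, pos=False))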