normalization.py
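"""Chinese text normalization utilities built on jieba: stopword loading,
word segmentation (optionally POS-tagged), noise-word filtering, and
whole-corpus normalization."""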
import jieba
import jieba.posseg as psg


# Load the stopword list

def get_stopword_list():
    # The stopword file stores one word per line; load it line by line.
    # Read with an explicit encoding so lookups match reliably.
    stop_word_path = './data/HGDstopwords.txt'
    with open(stop_word_path, encoding='UTF-8') as f:
        stopword_list = [sw.replace('\n', '') for sw in f]
    return stopword_list


def seg_to_list(sentence, pos=False):
    if not pos:
        # Segment without part-of-speech tagging
        seg_list = jieba.cut(sentence)
    else:
        # Segment with part-of-speech tagging
        seg_list = psg.cut(sentence)
    return seg_list


# Remove noise words
def word_filter(seg_list, pos=False):
    stopword_list = get_stopword_list()
    filter_list = []
    # The pos parameter controls whether to filter by part of speech.
    # Without POS filtering, every token is tagged 'n' so all are kept.
    for seg in seg_list:
        if not pos:
            word = seg
            flag = 'n'
        else:
            word = seg.word
            flag = seg.flag
        # When POS filtering is on, keep only noun tokens
        if not flag.startswith('n'):
            continue
        # Drop stopwords and words shorter than 2 characters
        if word not in stopword_list and len(word) > 1:
            filter_list.append(word)

    return filter_list


def remove_special_characters(text, pos=False):
    # Segment the text, filter out noise words, and rejoin with spaces
    tokens = seg_to_list(text, pos)
    filtered_tokens = word_filter(tokens, pos)
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text


def normalize_corpus(corpus, pos=False):
    normalized_corpus = []
    for text in corpus:
        # Optional spelling correction with pycorrector (disabled):
        # corrected_sent, detail = pycorrector.correct(text)
        # print(detail)
        # text1 = remove_special_characters(corrected_sent, pos)
        text1 = remove_special_characters(text, pos)
        normalized_corpus.append(text1)
    return normalized_corpus
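
# Illustrative usage sketch (not part of the original module). It assumes
# jieba is installed and ./data/HGDstopwords.txt exists; the sample
# sentence below is hypothetical.
if __name__ == '__main__':
    sample_corpus = ['自然语言处理是计算机科学领域的一个重要研究方向']
    # With pos=True, only noun tokens survive word_filter
    print(normalize_corpus(sample_corpus, pos=True))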