normalization.py
import re
import string
import jieba
import jieba.posseg as psg


# Load the stopword list

def get_stopword_list():
    # Path to the stopword file: one word per line, loaded line by line
    # Read with an explicit encoding to ensure accurate matching
    stop_word_path = '/home/work/semantic_platform_DAS/bottom_function/data/HGDstopwords.txt'
    with open(stop_word_path, encoding='UTF-8') as f:
        stopword_list = [sw.strip() for sw in f]
    return stopword_list


def seg_to_list(sentence, pos=False):
    if not pos:
        # Tokenize without POS tagging
        seg_list = jieba.cut(sentence)
    else:
        # Tokenize with POS tagging
        seg_list = psg.cut(sentence)
    return seg_list
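

# Note (illustrative, using jieba's default dictionary): jieba.cut yields plain
# strings, while psg.cut yields pair objects exposing .word and .flag, which is
# why word_filter below branches on the pos flag. For example:
#   list(jieba.cut('我爱北京天安门'))  ->  ['我', '爱', '北京', '天安门']
#   [(p.word, p.flag) for p in psg.cut('我爱北京天安门')]
#       ->  [('我', 'r'), ('爱', 'v'), ('北京', 'ns'), ('天安门', 'ns')]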


# Filter out noise words
def word_filter(seg_list, pos=False):
    stopword_list = get_stopword_list()
    filter_list = []
    # The pos flag decides whether to filter by part of speech.
    ## Without POS filtering, tag every word as 'n' (noun) so that all words pass the noun check below.
    for seg in seg_list:
        if not pos:
            word = seg
            flag = 'n'
        else:
            word = seg.word
            flag = seg.flag
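        # Keep only words whose POS tag starts with 'n' (nouns)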
        if not flag.startswith('n'):
            continue
        # Drop stopwords and words shorter than two characters
        if word not in stopword_list and len(word) > 1:
            filter_list.append(word)

    return filter_list
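

# Example (hypothetical sentence; exact output depends on jieba's dictionary
# and on the stopword file): with pos=True, only noun-tagged tokens of at
# least two characters that are not stopwords survive, e.g.
#   word_filter(psg.cut('北京的天气很好'), pos=True)
# keeps '北京' (ns) and '天气' (n) while dropping '的', '很', and '好'.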


def remove_special_characters(text, pos=False):
    tokens = seg_to_list(text, pos)
    filtered_tokens = word_filter(tokens, pos)
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text


def normalize_corpus(corpus, pos=False):
    normalized_corpus = []
    for text in corpus:
        # Optional spelling correction with pycorrector (currently disabled):
        # corrected_sent, detail = pycorrector.correct(text)
        # print(detail)
        # text1 = remove_special_characters(corrected_sent, pos)
        text1 = remove_special_characters(text, pos)
        normalized_corpus.append(text1)
    return normalized_corpus
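

# Usage sketch (hypothetical corpus; assumes jieba is installed and the
# stopword file referenced above exists). This demo is illustrative and not
# part of the original module logic.
if __name__ == '__main__':
    sample_corpus = [
        '今天北京的天气非常好。',
        '机器学习是人工智能的一个重要分支。',
    ]
    # With pos=True, only noun-tagged tokens survive filtering; each document
    # comes back as a single space-joined string.
    print(normalize_corpus(sample_corpus, pos=True))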