# ancient_poem_check.py
# -*- coding: utf-8 -*-
# @Author: Gree
# @Date:   2021-06-02 13:39:49
# @Last Modified by:   Gree
# @Last Modified time: 2021-06-02 13:48:42


import re
import pandas as pd


class AncientPoemCheck:
    """Check the automatic classification of ancient-poem corpus rows.

    The single public method, ``ancient_poem_check``, inspects one row
    (any mapping-like object supporting ``row[key]`` reads and writes) and
    yields one result dict carrying the original fields plus three verdict
    flags: ``domain_is_right``, ``intent_is_right`` and ``response_is_right``.
    """

    # Keyword pattern identifying poem-related queries. Compiled once at class
    # creation instead of being re-parsed from a literal on every call.
    _QUERY_PATTERN = re.compile(r'.*?(唐诗|宋词|下一句|陌上桑|诗|李白|杜甫|王维|王之涣|咏鹅|渔歌子|背|岳阳楼记|乡村四月|陶渊明|白居易|鹿柴|杨万里|李清照|静夜思|绝句|千金散尽|上一句|播放古诗|朗诵一首白|春蚕|朗诵|安得广厦千万间|刘禹锡|杨花落尽|野茫茫|观沧海|陋室铭|月是故乡明|桃花源记|锄禾|前不见古人|清平乐|登幽州台歌|小石潭记|江城子|清明时节|采薇|伯牙鼓琴|沁园春|迢迢牵牛星|踏歌行|咏柳|春望|一剪梅|鹅鹅|离骚|赠汪伦|木兰辞|朗读|卖油翁|孟浩然|枫桥夜泊|终南山|黄鹂鸣翠柳|蒹葭|孙权劝学|四时田园|离离原上草|见客棹歌回|天苍苍|晓出净慈寺|游子吟|子夜吴歌|凤凰台|处处闻啼鸟|烟花三月|凉州词|幽人应未眠|高鼎|白云生处|到西洲|白发三千丈|明月几时有|无边落木萧萧下|举头望明月|已亥杂诗|一岁一枯荣|新安吏|玉阶怨|关山月|过零丁洋|归去来兮|芙蓉楼送辛渐|枯藤老树昏鸦|南屏晚钟|小池|空山新雨后|渭城朝雨|早发白帝城|春眠不觉晓|八阵图|七步诗|题破山寺后禅院|送杜少府之任蜀州|青青子衿|雁门太守行|泊秦淮|播放凤求凰|登飞来峰|逢入京使|春夜洛城|夜雨寄北|忆江南|朱自清|无情未必真豪杰|东风不与周郎便|琵琶行|天涯若比邻|粒粒皆辛苦|夕阳无限好|红军不怕远征难|诗经小雅|敕勒歌|相思红豆生南国|浪淘沙|醉卧沙场君|六月二十七日望湖楼醉书|江南春|西江月|回乡偶书|记承天寺夜游|知否知否|卖炭翁|杜牧|回乡偶书|诗经|田园诗|采莲曲|迎春曲|广乐的诗|范仲淹|贺知章|张九龄|秋风词|黄河入海流|白日依山尽|风流天下闻|江上渔者|短歌行|咏梅|满江红|早知潮有信|花间一壶酒|寒蝉凄切|春寒赐浴华清池|牧童遥指杏花村|归园田居|赤壁怀古|又岂在朝朝暮暮|两情若是久长时|雕栏玉砌应犹在|千山鸟飞绝|常记溪亭日暮|北风卷地白草折|金樽清酒斗十千|苏轼|谁道人生无再少|寻寻觅觅|云母屏风烛影深|离别家乡岁月多|造化钟神秀|何当共剪西窗烛|庄生晓梦迷蝴蝶|李煜).*')

    # A query containing any of these substrings is not treated as a poem
    # query even when it matches the keyword pattern.
    _EXCLUDED_SUBSTRINGS = (
        '背影', '想听诗歌', '背包', '高英', 'theme背', '诗为有', 'june',
        '落花诗', '闭嘴', '黑锅', '占廷', '收听太', 'fm三', '唐诗蝉',
        '穿条秋裤回家', '菊花二', '倪方六', '第八代', '三三原则', '手淫危害',
        '正式参战', 'a上', 'raze', '停止',
    )

    # Queries that are excluded only when they match exactly.
    _EXCLUDED_QUERIES = frozenset((
        '秋意秋意诗意', '给我背一个白居易', '来一首绝句',
    ))

    # A response containing any of these markers is judged correct.
    _RESPONSE_MARKERS = ('作品', '来自', '全文如下')

    @staticmethod
    def _contains(container, needle):
        """Return ``needle in container``, or False if the test is unsupported.

        Missing field values (``None``, float NaN) do not support ``in`` and
        raise ``TypeError``; they are treated as "does not contain".
        """
        try:
            return needle in container
        except TypeError:
            return False

    @staticmethod
    def _build_record(row, query, domain, intent, response_text, flags):
        """Assemble the output dict from the row fields and the three flags."""
        domain_flag, intent_flag, response_flag = flags
        return {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': domain_flag,
            'intent_is_right': intent_flag,
            'response_is_right': response_flag,
        }

    def ancient_poem_check(self, row):
        """Yield one verdict record for an ancient-poem corpus row.

        Args:
            row: mapping-like object providing the keys ``domain``, ``query``,
                ``intent``, ``response_text``, ``date_time``, ``request_id``,
                ``mac_wifi`` and ``user_id``.

        Yields:
            A dict with the row's fields plus ``domain_is_right``,
            ``intent_is_right`` and ``response_is_right``:

            * Query matches the keyword pattern and no exclusion applies:
              domain is ``'yes'``; intent is ``'yes'`` when it contains
              ``'search'``; response is ``'yes'`` when it contains one of the
              response markers. Otherwise the respective flag is ``'no'``
              (this includes a missing intent or response).
            * Otherwise, response contains ``'全文'``: all three flags ``'yes'``.
            * Otherwise: all three flags are empty strings.

            In the first two cases the three flags are also written back
            into ``row``.

        Raises:
            KeyError: if ``domain``, ``query``, ``intent`` or ``response_text``
                is missing from ``row`` (raised on first iteration). Errors
                while reading the remaining keys are printed and end the
                generator without yielding.
        """
        cls = AncientPoemCheck

        domain = row['domain']
        query = row['query']
        intent = row['intent']
        response_text = row['response_text']

        # A non-string query (None, NaN, ...) cannot be searched; treat it as
        # "no keyword found" so the row is still classified by its response
        # instead of being lost to an unbound-variable error later on.
        try:
            query_result = cls._QUERY_PATTERN.search(query)
        except TypeError as e:
            print("The error of getting query_result in the module of ancient_poem_check():", e)
            query_result = None

        # Exclusions are only evaluated once the pattern matched, which also
        # guarantees that ``query`` is a string at that point.
        is_poem_query = (
            query_result is not None
            and not any(fragment in query for fragment in cls._EXCLUDED_SUBSTRINGS)
            and query not in cls._EXCLUDED_QUERIES
        )

        # Best-effort boundary: a row that cannot be read or written is
        # reported and skipped rather than aborting the caller's iteration.
        try:
            if is_poem_query:
                intent_flag = 'yes' if cls._contains(intent, 'search') else 'no'
                response_ok = any(
                    cls._contains(response_text, marker)
                    for marker in cls._RESPONSE_MARKERS
                )
                flags = ('yes', intent_flag, 'yes' if response_ok else 'no')
            elif cls._contains(response_text, '全文'):
                flags = ('yes', 'yes', 'yes')
            else:
                flags = None

            if flags is None:
                # Unclassified rows are reported with blank flags and the row
                # itself is left untouched.
                record = cls._build_record(
                    row, query, domain, intent, response_text, ("", "", ""))
            else:
                # Keep the verdicts on the row as well; callers may read them.
                row['domain_is_right'] = flags[0]
                row['intent_is_right'] = flags[1]
                row['response_is_right'] = flags[2]
                record = cls._build_record(
                    row, query, domain, intent, response_text, flags)
        except Exception as e:
            print("The error of getting generator in the module of ancient_poem_check():", e)
            return

        yield record