diff --git a/autoCheck/airconditioner_check.py b/autoCheck/airconditioner_check.py new file mode 100644 index 0000000000000000000000000000000000000000..f34946ef53a6b85a2d9a509c896f69f0f7dadfc2 --- /dev/null +++ b/autoCheck/airconditioner_check.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2020-12-18 15:13:33 +# @Last Modified by: Gree +# @Last Modified time: 2020-12-18 15:18:22 + + +import re +import pandas as pd + + +def AirconditionerCheck(input_df): + """ + 模块功能:检查airconditioner类的语料字段domain、intent、response_text是否正确 + iterrows: 返回值为元组,(index,row) + """ + print('The module of airconditioner_check is running!') + + for index, row in input_df.iterrows(): + query = row['query'] + domain = row['domain'] + intent = row['intent'] + response_text = row['response_text'] + + try: + """异常捕获""" + # 正则表达式匹配数据规律 + query_result = re.search(r'.*?(空调|模式|制冷|制热|送风|把自动|停止风|打开加热|智能风|为自动|下出风|自清洁|环绕风|开风随|请休息|打开健康|抽湿|休息吧|温度提高5度格力金贝|无风感|祛湿).*', query) + if query_result is not None and '音量' not in query and '海洋风' not in query and '天气' not in query and '加热' not in query and '休息吧' not in query and '关闭空调关闭语音' not in query and query != '空调温度' and query != '把空调温度' and query != '下出风' and query != '格力空调最小音量': + if domain == 'Airconditioner': + if '空调' in query: + row['domain_is_right'] = 'yes' + if 'control' in intent: + row['intent_is_right'] = 'yes' + else: + row['intent_is_right'] = 'no' + + try: + if response_text == '': + row['response_is_right'] = 'yes' + else: + pass + except Exception as e: + print(e) + + else: + row['domain_is_right'] = 'no' + if 'control' in intent: + row['intent_is_right'] = 'yes' + else: + row['intent_is_right'] = 'no' + + try: + if response_text == '': + row['response_is_right'] = 'yes' + else: + pass + + except Exception as e: + print(e) + + # 生成器 + yield { + # 'initial_id': row['id'], + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 
# Keywords (poets, poem titles, famous lines, verbs such as "recite") whose
# presence marks a query as a candidate ancient-poem request.  Compiled once
# at import time instead of once per row.
_ANCIENT_POEM_PATTERN = re.compile(
    r'.*?(唐诗|宋词|下一句|陌上桑|诗|李白|杜甫|王维|王之涣|咏鹅|渔歌子|背|岳阳楼记|乡村四月|陶渊明|白居易|鹿柴|杨万里|李清照|静夜思|绝句|千金散尽|上一句|播放古诗|朗诵一首白|春蚕|朗诵|安得广厦千万间|刘禹锡|杨花落尽|野茫茫|观沧海|陋室铭|月是故乡明|桃花源记|锄禾|前不见古人|清平乐|登幽州台歌|小石潭记|江城子|清明时节|采薇|伯牙鼓琴|沁园春|迢迢牵牛星|踏歌行|咏柳|春望|一剪梅|鹅鹅|离骚|赠汪伦|木兰辞|朗读|卖油翁|孟浩然|枫桥夜泊|终南山|黄鹂鸣翠柳|蒹葭|孙权劝学|四时田园|离离原上草|见客棹歌回|天苍苍|晓出净慈寺|游子吟|子夜吴歌|凤凰台|处处闻啼鸟|烟花三月|凉州词|幽人应未眠|高鼎|白云生处|到西洲|白发三千丈|明月几时有|无边落木萧萧下|举头望明月|已亥杂诗|一岁一枯荣|新安吏|玉阶怨|关山月|过零丁洋|归去来兮|芙蓉楼送辛渐|枯藤老树昏鸦|南屏晚钟|小池|空山新雨后|渭城朝雨|早发白帝城|春眠不觉晓|八阵图|七步诗|题破山寺后禅院|送杜少府之任蜀州|青青子衿|雁门太守行|泊秦淮|播放凤求凰|登飞来峰|逢入京使|春夜洛城|夜雨寄北|忆江南|朱自清|无情未必真豪杰|东风不与周郎便|琵琶行|天涯若比邻|粒粒皆辛苦|夕阳无限好|红军不怕远征难|诗经小雅|敕勒歌|相思红豆生南国|浪淘沙|醉卧沙场君|六月二十七日望湖楼醉书|江南春|西江月|回乡偶书|记承天寺夜游|知否知否|卖炭翁|杜牧|回乡偶书|诗经|田园诗|采莲曲|迎春曲|广乐的诗|范仲淹|贺知章|张九龄|秋风词|黄河入海流|白日依山尽|风流天下闻|江上渔者|短歌行|咏梅|满江红|早知潮有信).*'
)

# A query containing any of these substrings is NOT treated as a poem request
# even though it matches the keyword pattern above.
_ANCIENT_POEM_EXCLUDED_PARTS = (
    '背影', '想听诗歌', '背包', '高英', 'theme背', '诗为有', 'june', '落花诗',
    '闭嘴', '黑锅', '占廷', '收听太', 'fm三', '唐诗蝉', '穿条秋裤回家', '菊花二',
    '倪方六', '第八代', '三三原则', '手淫危害', '正式参战', 'a上', 'raze', '停止',
)

# Queries that are excluded only when they match exactly.
_ANCIENT_POEM_EXCLUDED_QUERIES = frozenset((
    '秋意秋意诗意', '给我背一个白居易', '来一首绝句',
))


def _is_ancient_poem_query(query):
    """Return True when *query* looks like an ancient-poem request."""
    if _ANCIENT_POEM_PATTERN.search(query) is None:
        return False
    if query in _ANCIENT_POEM_EXCLUDED_QUERIES:
        return False
    return not any(part in query for part in _ANCIENT_POEM_EXCLUDED_PARTS)


def AncientPoemCheck(input_df):
    """Check the domain / intent / response_text labels of ancient-poem queries.

    Generator.  For every row of *input_df* whose ``query`` looks like an
    ancient-poem request and whose ``domain`` is ``'ancient_poem'``, yield a
    dict with the original fields plus three verdict fields:

    * ``domain_is_right``   -- always ``'yes'`` for yielded rows;
    * ``intent_is_right``   -- ``'yes'`` when ``intent`` contains ``'search'``;
    * ``response_is_right`` -- ``'yes'`` when ``response_text`` contains
      ``'作品'`` or ``'来自'``.

    Rows with another domain, or whose query is not a string, are skipped.

    A missing (``None``/NaN) ``intent`` or ``response_text`` is judged
    ``'no'``.  The previous implementation raised a ``TypeError`` on such
    values and swallowed it with a bare ``except``, which silently dropped
    the row (intent) or left the verdict empty (response); its explicit
    ``response_text is None`` branch could never be reached.

    Args:
        input_df: pandas DataFrame with the columns ``date_time``,
            ``request_id``, ``mac_wifi``, ``user_id``, ``query``, ``domain``,
            ``intent`` and ``response_text``.  A missing column raises
            ``KeyError`` instead of being silently ignored.

    Yields:
        dict: one labelled record per matching row.
    """
    print('The module of ancient_poem_check is running!')

    # iterrows() yields (index, row) tuples; each row is a copy, so verdicts
    # are kept in locals rather than written back into the row.
    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        if not isinstance(query, str):
            continue
        if domain != 'ancient_poem' or not _is_ancient_poem_query(query):
            continue

        intent_ok = isinstance(intent, str) and 'search' in intent
        response_ok = isinstance(response_text, str) and (
            '作品' in response_text or '来自' in response_text
        )

        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'yes' if intent_ok else 'no',
            'response_is_right': 'yes' if response_ok else 'no',
        }

    print('The module of ancient_poem_check is executed!')
iterrows: 返回值为元组,(index,row) + """ + print('The module of chat_check is running!') + + for index, row in input_df.iterrows(): + query = row['query'] + domain = row['domain'] + intent = row['intent'] + response_text = row['response_text'] + + + try: + if domain == 'chat': + if ('温度升' in query or '最大风' in query or '强风' in query or '风小点' in query or '调到低速' in query or '切换到最小温度' in query or '温度放到30度' in query or '设为30度' in query or '高点高风' in query or '风量' in query or '降低20' in query or '调到28度' in query or '风速调到最大' in query or '温度调大到' in query or '风速调到中' in query or '温度调到' in query or '温度达到' in query or '调低低档' in query or '温度升高' in query or '温度调' in query or '调大低档' in query or '关机关闭' in query or '低档调到最低' in query or '调大到30度' in query or '调高中风档' in query or '风速设为自动风' in query or '调大到中风档' in query) and '空调' not in query: + row['domain_is_right'] = 'no' + row['intent_is_right'] = 'no' + row['response_is_right'] = 'no' + elif ('换风模式' in query or '上下扫风' in query or '辅热模式' in query or '制热' in query or '热风' in query or '定向扫风' in query or '空调模式' in query or '制冷' in query or '暖空调' in query or '采暖的' in query or '热空调' in query or '暖风' in query or '打开空调' in query or '关闭空调' in query or '暖风' in query or '空调温度' in query or '空调风档' in query or '空调风速设' in query or '格力空调有点冷' in query or '自动风速格力空调' in query or '降低20格力空调' in query or '关掉空调' in query or '关闭格力空调' in query or '格力空调关闭' in query or '将开空调' in query or '最小风量空调' in query) and '新闻' not in query and '空调' in query and query != '格力空调模式': + row['domain_is_right'] = 'no' + row['intent_is_right'] = 'no' + row['response_is_right'] = 'no' + elif ('丑小鸭' in query or '安徒生' in query or '三国演义' in query or '电台' in query) and '几点钟' not in query: + row['domain_is_right'] = 'no' + row['intent_is_right'] = 'no' + row['response_is_right'] = 'no' + elif ('乘以' in query or '除以' in query) and '歌行' not in query: + row['domain_is_right'] = 'no' + row['intent_is_right'] = 'no' + row['response_is_right'] = 'no' + elif ('唐诗' in query or '古诗' in query or '白居易' 
in query or '陌上桑' in query or '李煜' in query or '诗歌' in query or '杜甫' in query or '乐府诗' in query) and '自动风' not in query and '最小' not in query and '国庆节' not in query: + row['domain_is_right'] = 'no' + row['intent_is_right'] = 'no' + row['response_is_right'] = 'no' + elif ('平安夜' in query or '是万圣节' in query or '十二月二十六日是' in query or '一月二十五号是' in query or '十二月二十二日是' in query or '南瓜节' in query or '10月31日万圣节' in query) and query != '平安夜部' and query != '平安夜吗' and query != '不是是平安夜': + row['domain_is_right'] = 'no' + row['intent_is_right'] = 'no' + row['response_is_right'] = 'no' + elif '天气' in query and '新闻' not in query: + row['domain_is_right'] = 'no' + row['intent_is_right'] = 'no' + row['response_is_right'] = 'no' + elif ('热点新' in query or '新闻联播' in query or '打开头条' in query) and '音量' not in query and '模式' not in query and '天气' not in query: + row['domain_is_right'] = 'no' + row['intent_is_right'] = 'no' + row['response_is_right'] = 'no' + elif '刘德华' in query or '点一首' in query or '广东不下雪' in query or '塔斯肯的铃声' in query or '小兔子乖乖' in query or '如果高兴你就拍拍手' in query or '笑起来真好看' in query or '不过人间' in query or '漂洋过海来看你' in query or '江南style' in query or '桥边的姑娘' in query or '海来阿木' in query or '百鸟朝凤' in query or '谢谢你的爱' in query or '可可托海的牧羊人' in query: + row['domain_is_right'] = 'no' + row['intent_is_right'] = 'no' + row['response_is_right'] = 'no' + elif ('关闭刚才播放' in query or '切换到薛之谦' in query or '关闭音乐' in query or '打开酷狗' in query or '银河奥特曼' in query or '锦衣之下' in query or '音乐也大一点' in query or '单曲循环' in query or '关闭语音' in query or '最小音量' in query or '音乐调小' in query or '皮卡丘' in query or '春光灿烂猪八戒' in query or '赛罗奥特曼格斗' in query or '萌鸡小队' in query or '三嫁惹君心' in query or '蜘蛛侠' in query or '播放梦幻奇缘' in query or '播放甲午中日战争' in query or '小猴子爬山' in query or '火星情报局' in query or '播放留言' in query or '播放音乐' in query or '声音关小一点' in query or '关掉音乐' in query or '请播交响乐' in query or '音乐放小点' in query or '音乐调大' in query or '关闭相声' in query) and '12月26' not in query and '几日' not in query and '新闻' not in 
class ConnSql(object):
    """Base class that opens a MySQL connection described by ``sql.conf``."""

    def conn_sql(self):
        """Read ``sql.conf`` and connect to the database.

        The ``[log_on]`` section must provide ``host``, ``port``, ``user``,
        ``passwd`` and ``db``; a missing section or key raises ``KeyError``.

        Returns:
            The pymysql connection, or ``None`` when connecting failed (the
            error is printed).
        """
        cf = configparser.ConfigParser()
        cf.read("sql.conf")  # relative path: resolved against the working directory

        log_on = cf["log_on"]
        conf_dict = {
            "host": str(log_on["host"]),
            "port": int(log_on["port"]),
            "user": str(log_on["user"]),
            "passwd": str(log_on["passwd"]),
            "db": str(log_on["db"]),
        }

        try:
            conn = pymysql.connect(**conf_dict)
        except Exception as e:
            print(e)
            return None

        print("Database connection is successful!")
        return conn


class DbRun(ConnSql):
    """Database operations used by the auto-labelling pipeline.

    Every method opens its own connection through ``conn_sql()`` and closes
    it before returning.  Methods are best-effort: on any database error the
    exception is printed and ``None`` is returned.  When no connection could
    be opened they return ``None`` instead of failing in ``conn.close()``.
    """

    _SELECT_COLUMNS = (
        "date_time, request_id, mac_wifi, user_id, query, domain, intent, response_text"
    )
    _INSERT_COLUMNS = (
        "date_time, request_id, mac_wifi, user_id, query, domain, intent, "
        "response_text, domain_is_right, intent_is_right, response_is_right"
    )
    # Table names cannot be bound as query parameters, so they are checked
    # against the characters MySQL permits in unquoted identifiers
    # (optionally qualified as ``schema.table``) before being interpolated.
    _TABLE_NAME = re.compile(
        r'[0-9A-Za-z_$\u0080-\uffff]+(\.[0-9A-Za-z_$\u0080-\uffff]+)?'
    )

    @classmethod
    def _checked_table_name(cls, table_name):
        """Return *table_name* unchanged, or raise ValueError if it is unsafe."""
        if not isinstance(table_name, str) or cls._TABLE_NAME.fullmatch(table_name) is None:
            raise ValueError("Invalid table name: %r" % (table_name,))
        return table_name

    def table_exists(self, table_name):
        """Tell whether *table_name* exists in the connected database.

        Returns:
            ``1`` if the table exists, ``0`` if it does not, ``None`` when
            the check itself failed.
        """
        print("Loading the module of table_exists ...")

        conn = self.conn_sql()
        if conn is None:
            return None
        try:
            with conn.cursor() as cursor:
                cursor.execute("show tables;")
                rows = cursor.fetchall()
            # Read the single column of each row directly instead of
            # regex-parsing the repr of the result set.  Dict rows (from a
            # DictCursor) are accepted as well.
            existing = set()
            for entry in rows:
                if isinstance(entry, dict):
                    existing.update(str(value) for value in entry.values())
                else:
                    existing.add(str(entry[0]))
            if table_name in existing:
                print("The table of %s is exists!" % table_name)
                return 1
            print("The table of %s is not exists!" % table_name)
            return 0
        except Exception as e:
            print(e)
        finally:
            conn.close()

    def new_table(self, table_name):
        """Create the result table *table_name* with the labelled-data schema."""
        conn = self.conn_sql()
        if conn is None:
            return None
        try:
            sql = (
                "CREATE TABLE %s (`id` INT PRIMARY KEY AUTO_INCREMENT,"
                "`date_time` datetime DEFAULT NULL,"
                "`request_id` varchar(50) DEFAULT NULL,"
                "`mac_wifi` varchar(50) DEFAULT NULL,"
                "`user_id` varchar(50) DEFAULT NULL,"
                "`query` varchar(255) DEFAULT NULL,"
                "`domain` varchar(50) DEFAULT NULL,"
                "`intent` varchar(50) DEFAULT NULL,"
                "`response_text` text DEFAULT NULL,"
                "`domain_is_right` varchar(50) DEFAULT NULL,"
                "`intent_is_right` varchar(50) DEFAULT NULL,"
                "`response_is_right` varchar(50) DEFAULT NULL) CHARSET=utf8;"
            ) % self._checked_table_name(table_name)
            with conn.cursor() as cursor:
                cursor.execute(sql)
            conn.commit()  # explicit commit of the transaction
            print("Data sheet of %s is established!" % table_name)
        except Exception as e:
            print(e)
        finally:
            conn.close()

    def initial_data(self, table_name, time_before='', time_now=''):
        """Fetch the raw rows that are to be labelled automatically.

        Args:
            table_name: source table.
            time_before: start of the time window.  An empty string means
                "no window": every row of the table is returned.
            time_now: end of the time window.

        The bounds may be wrapped in single quotes (the form produced by
        ``GetTime.get_time``); the quotes are stripped and the values are
        sent as bound query parameters rather than spliced into the SQL text.

        Returns:
            The fetched rows, or ``None`` on failure.
        """
        print("Loading the module of initial_data ...")

        conn = self.conn_sql()
        if conn is None:
            return None
        try:
            sql = "select " + self._SELECT_COLUMNS + " from " + self._checked_table_name(table_name)
            with conn.cursor() as cursor:
                if time_before == '':
                    cursor.execute(sql)
                    result = cursor.fetchall()
                    print("Get initial data successfully!")
                else:
                    bounds = (str(time_before).strip("'"), str(time_now).strip("'"))
                    cursor.execute(sql + " where date_time BETWEEN %s and %s", bounds)
                    result = cursor.fetchall()
                    print("Get initial data between %s and %s" % (time_before, time_now))
            return result
        except Exception as e:
            print(e)
        finally:
            conn.close()

    def contrast_data(self, table_name):
        """Fetch ``(date_time, query)`` for every row already stored in *table_name*.

        The result is meant to be compared against the ``query`` column of
        the raw data so that already-labelled queries can be skipped.

        Returns:
            The fetched rows, or ``None`` on failure.
        """
        print("Loading the module of contrast_data ...")

        conn = self.conn_sql()
        if conn is None:
            return None
        try:
            sql = "select date_time, query from %s" % self._checked_table_name(table_name)
            with conn.cursor() as cursor:
                cursor.execute(sql)
                result = cursor.fetchall()
            print("Get contrast data successfully!")
            return result
        except Exception as e:
            print(e)
        finally:
            conn.close()

    def insert_data(self, table_name, df):
        """Bulk-insert the labelled rows of *df* into *table_name*.

        Args:
            table_name: destination table.
            df: DataFrame whose first eleven columns are, in order,
                date_time, request_id, mac_wifi, user_id, query, domain,
                intent, response_text, domain_is_right, intent_is_right and
                response_is_right.  ``date_time`` must hold datetime values.
        """
        conn = self.conn_sql()
        if conn is None:
            return None
        try:
            sql = (
                "INSERT INTO " + self._checked_table_name(table_name)
                + " (" + self._INSERT_COLUMNS + ")"
                + " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            )
            sql_lst = []
            for record in df.itertuples(index=False, name=None):
                # The driver receives the timestamp as a formatted string.
                stamp = datetime.strftime(record[0], "%Y-%m-%d %H:%M:%S")
                sql_lst.append((stamp,) + tuple(record[1:11]))
            print("Data is inserting...")
            with conn.cursor() as cursor:
                cursor.executemany(sql, sql_lst)  # one batched insert
            conn.commit()  # explicit commit of the transaction
        except Exception as e:
            print(e)
        finally:
            conn.close()
            print('The inserting of data is finished!')
class DuplicateCheck(object):
    """Remove raw rows whose ``query`` has already been labelled and stored.

    The ``query`` column of the raw data is compared with the queries that
    are already in the result table; rows with an already-known query are
    dropped so the same query is not pushed through auto-labelling twice.
    """

    def __init__(self):
        # Every known query seen so far; it grows across calls to
        # duplicate_check() on the same instance.
        self.query_lst = []

    def duplicate_check(self, contrast_data, initial_df):
        """Return the rows of *initial_df* whose query is not yet known.

        The previous implementation called ``initial_df.drop(index=index)``
        and discarded the result.  ``DataFrame.drop`` is not in-place, so no
        row was ever removed and the input frame came back unchanged.

        Args:
            contrast_data: iterable of ``(date_time, query)`` pairs already
                present in the result table.
            initial_df: DataFrame of raw data with a ``query`` column.  It is
                not modified.

        Returns:
            A new DataFrame without the rows whose ``query`` appears in
            *contrast_data* (or in an earlier call on this instance), or
            ``None`` when filtering failed (the error is printed).
        """
        print("Loading the module of duplicate_check ...")
        try:
            df_duplicate = pd.DataFrame(list(contrast_data), columns=['datetime', 'query'])
            # Each distinct query is recorded once, in first-seen order.
            self.query_lst.extend(df_duplicate['query'].drop_duplicates().tolist())
        except Exception as e:
            # Best effort: keep filtering with whatever is already known.
            print(e)

        try:
            known_queries = set(self.query_lst)  # O(1) membership tests
            input_df = initial_df[~initial_df['query'].isin(known_queries)]

            print('The dimension of input dataframe: ', end='')
            print(input_df.shape)  # dimensions of the filtered frame
            print("Get input data successfully!")

            return input_df
        except Exception as e:
            print(e)
class GetTime(object):
    """Compute the time window used to select raw data from the database.

    The reference instant is captured once, when the instance is created.
    ``get_time`` derives the start of the window from it (e.g. one week or
    one month earlier); the pair is then passed to ``DbRun.initial_data``.
    """

    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(self, now=None):
        """Capture the reference instant.

        Args:
            now: optional ``datetime.datetime`` to use instead of the current
                local time, so that windows are reproducible in tests or
                backfills.  Defaults to ``datetime.datetime.now()``.
        """
        self.time_now_timestamp = datetime.datetime.now() if now is None else now
        # Alternative reference point 250 days in the past; kept as a public
        # attribute but not used by get_time().
        self.time_mid_timestamp = self.time_now_timestamp - datetime.timedelta(days=250)

    def get_time(self, days=0):
        """Return the ``(time_before, time_now)`` window covering the last *days* days.

        Args:
            days: window length in days.  With the default ``0`` both values
                are empty strings, which callers interpret as "no window".

        Returns:
            tuple: two strings.  For a non-zero *days* each is a timestamp
            formatted as ``YYYY-mm-dd HH:MM:SS`` wrapped in single quotes.

        Raises:
            TypeError: if *days* is not a number.  Errors used to be printed
                and swallowed, making the method return ``None`` and the
                caller's tuple unpacking fail with an unrelated message.
        """
        print("Loading the module of get_time ...")

        if days == 0:
            time_before = ''
            time_now = ''
        else:
            window_start = self.time_now_timestamp - datetime.timedelta(days=days)
            time_now = "'" + self.time_now_timestamp.strftime(self._TIME_FORMAT) + "'"
            time_before = "'" + window_start.strftime(self._TIME_FORMAT) + "'"

        print("The information of time is gained!")

        return (time_before, time_now)
domain == 'globalctrl': + if ('停止播放' in query or '关闭新闻' in query or '关掉新闻' in query or '关新闻' in query or '给我播放' in query or '音乐声音' in query or '音乐' in query or '暂停播放' in query or '关闭静音' in query or '声音关掉' in query or '大声点' in query or '调小一点' in query or '音小点' in query or '声音小一点' in query or '小声点' in query or '加大一点' in query or '声音不大' in query or '放大声点' in query or '调小点' in query or '增大音' in query or '放大' in query or '开到最小' in query or '音量' in query or '声音关' in query or '声音轻点' in query or '大点声' in query or '不听了' in query or '关闭语音' in query or '音低点' in query or '声音调' in query or '声音响' in query or '关小声' in query or '音小一点' in query or '声音大一点' in query or '声音再大' in query or '放小声音' in query or '声音大些' in query or '小点声儿' in query or '关闭声音' in query) and '打开军' not in query and '温度调到' not in query and '严肃处理' not in query and '在放手' not in query and '你还在讲' not in query and '学校' not in query and '入竹' not in query and '性交' not in query and '爆料' not in query and '关机' not in query and '饮料' not in query and '6哲' not in query and '最小风' not in query and '一线城市' not in query and '听不懂我讲话' not in query and '买' not in query and '小乖' not in query and '礼尚往来' not in query and '什么美食' not in query and '毕业' not in query and '小美' not in query and '小麦' not in query and '花儿也谢了' not in query and '打开9档' not in query and '解冻' not in query and '小白' not in query and '选择wifi' not in query and '小伙子' not in query and '新闻综合' not in query and '诗' not in query and '关系到' not in query and '度' not in query and '一月' not in query and '国家' not in query and '美' not in query and '妹' not in query and '管家' not in query and '小微' not in query and '反应' not in query and '祖国' not in query and '油烟机' not in query and '爸' not in query and '叶月' not in query and '不是我不听了' not in query and '系统安全' not in query and '耕地' not in query and '我感觉你很大' not in query and '协议' not in query and '南瓜' not in query and '万家' not in query and '扫风' not in query and '狗' not in query and '风' not in query and '窗帘' not in query and '清洁' not in query 
# Date / time / festival expressions whose presence marks a query as a
# candidate holiday request.  Compiled once at import time instead of once
# per row.
_HOLIDAY_PATTERN = re.compile(
    r'.*?(多少点|几点|几日|几月|农历|现在时间|北京时间|什么时间|多少天|几天|礼拜|阳历|节日|[0-9]月|多少号|现在的时间|多长时间|今天星期几|今天几号|是初几|周几|什么时候|到过年|春节|圣诞节|还有多久过年|星期几|平安夜|国庆节|什么日子|过年|除夕|什么时候|看一下时间|父亲节|教师节|建党节|差几天|周几|父亲节|多久放假|几号|哪一天|哪天是|儿童节|元旦|劳动|清明节|元宵节|情人节|腊八节|重阳节|看一下时间|端午节|妇女节|中秋节|过年|愚人节|号还有多久|植树节|放几天|秋分|万圣节|母亲节|几时几分|一月|国际禁毒日|冬至|明天的|什么节).*'
)

# A query containing any of these substrings is NOT treated as a holiday
# request even though it matches the pattern above.
_HOLIDAY_EXCLUDED_PARTS = (
    '几点起', '肯德基', '温度', '22度', '汇率', '剑杰', '新闻', '除以', '来一首',
    '技能', '张北北的歌', '停止', '音量', '现在北京12月',
)

# Queries that are excluded only when they match exactly.
_HOLIDAY_EXCLUDED_QUERIES = frozenset((
    '一月二十', '安阳16日是什么日子', '春节有多少天', '明天的', '冬至是哪',
    '冬至有多少天', '12月22日', '一月二十九号', '还有多少天帮我查一查', '1月25号',
    '四日是平安夜12月24日是平安夜吗',
))

# A response containing any of these fragments is judged wrong.
_HOLIDAY_BAD_RESPONSE_PARTS = ('我暂时', '春节是春节')


def _is_holiday_query(query):
    """Return True when *query* looks like a holiday / date / time request."""
    if _HOLIDAY_PATTERN.search(query) is None:
        return False
    if query in _HOLIDAY_EXCLUDED_QUERIES:
        return False
    return not any(part in query for part in _HOLIDAY_EXCLUDED_PARTS)


def HolidayCheck(input_df):
    """Check the domain / intent / response_text labels of holiday queries.

    Generator.  For every row of *input_df* whose ``query`` looks like a
    holiday / date / time request and whose ``domain`` is ``'holiday'``,
    yield a dict with the original fields plus three verdict fields:

    * ``domain_is_right``   -- always ``'yes'`` for yielded rows;
    * ``intent_is_right``   -- ``'yes'`` when ``intent`` contains ``'search'``;
    * ``response_is_right`` -- ``'yes'`` when ``response_text`` is a string
      containing neither ``'我暂时'`` nor ``'春节是春节'``.

    Rows with another domain, or whose query is not a string, are skipped.

    A missing (``None``/NaN) ``intent`` is judged ``'no'``.  Previously
    ``'search' in intent`` raised a ``TypeError`` that a bare ``except``
    swallowed, silently dropping the row.  A non-string ``response_text`` is
    judged ``'no'`` instead of leaving the verdict empty.

    Args:
        input_df: pandas DataFrame with the columns ``date_time``,
            ``request_id``, ``mac_wifi``, ``user_id``, ``query``, ``domain``,
            ``intent`` and ``response_text``.  A missing column raises
            ``KeyError`` instead of being silently ignored.

    Yields:
        dict: one labelled record per matching row.
    """
    print('The module of holiday_check is running!')

    # iterrows() yields (index, row) tuples; each row is a copy, so verdicts
    # are kept in locals rather than written back into the row.
    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        if not isinstance(query, str):
            continue
        if domain != 'holiday' or not _is_holiday_query(query):
            continue

        intent_ok = isinstance(intent, str) and 'search' in intent
        response_ok = isinstance(response_text, str) and not any(
            part in response_text for part in _HOLIDAY_BAD_RESPONSE_PARTS
        )

        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'yes' if intent_ok else 'no',
            'response_is_right': 'yes' if response_ok else 'no',
        }

    print('The module of holiday_check is executed!')
universal_control_check as ucc +import weather_check as wc +import fm_check as fc +import holiday_check as hc +import music_check as mc +import news_check as nc +import ancient_poem_check as apc +import science_check as sc +import sports_check as spc +import stocks_check as stc +import translate_check as tc +import chat_check as cc +import global_control_check as gcc + + + + +def IntegrateData(df): + """ + 1. IntergrateData函数用来整合每个自动化分类检查的函数返回的item。 + 2. 获取整合数据后的DataFrame。 + """ + result_lst = [] # 新建一个空列表用来接收自动化分类检查各个类别返回的生成器中的item。 + + for item in ac.AirconditionerCheck(df): + + result_lst.append(item) + + for item in apc.AncientPoemCheck(df): + result_lst.append(item) + + for item in fc.FmCheck(df): + result_lst.append(item) + + for item in hc.HolidayCheck(df): + result_lst.append(item) + + for item in mc.MusicCheck(df): + result_lst.append(item) + + for item in nc.NewsCheck(df): + result_lst.append(item) + + for item in pcc.PlayControlCheck(df): + result_lst.append(item) + + for item in sc.ScienceCheck(df): + result_lst.append(item) + + for item in spc.SportsCheck(df): + result_lst.append(item) + + for item in stc.StockCheck(df): + result_lst.append(item) + + for item in tc.TranslateCheck(df): + result_lst.append(item) + + for item in ucc.UniversalControlCheck(df): + result_lst.append(item) + + for item in wc.WeatherCheck(df): + result_lst.append(item) + + for item in gcc.GlobalControlCheck(df): + result_lst.append(item) + + df = pd.DataFrame(result_lst, columns=['date_time', 'request_id', 'mac_wifi', 'user_id', 'query', 'domain', 'intent', 'response_text', 'domain_is_right', 'intent_is_right', 'response_is_right']) + df = df.astype(object).where(pd.notnull(df), '') + df = df.sort_values(["date_time"], ascending=True) + + print('The dimension of final dataframe: ', end='') + print(df.shape) # 输出当前数据框的维度 + return df + + +def main(): + GetTime = gt.GetTime() # 实例化时间操作类 + time_before, time_now = GetTime.get_time(days=1) + + DbRun = cs.DbRun() # 实例化数据库操作类 + + """ + 
def main():
    """Run one check cycle: load raw rows, classify them, store the result.

    Rows from the last day are read from the source table, converted to a
    DataFrame, passed through ``IntegrateData`` and bulk-inserted into the
    result table (which is created first when it does not exist yet).

    If the source table does not exist, an error is printed and the cycle
    is aborted. Previously execution continued and failed with a NameError
    on the unbound ``initial_data``.
    """
    GetTime = gt.GetTime()  # time helper
    time_before, time_now = GetTime.get_time(days=1)

    DbRun = cs.DbRun()  # database helper

    # Table names are hard-coded for now; they used to be read from the
    # "table_names" section of sql.conf.
    initial_table = "ctoc_tb"     # raw data table
    insert_table = "final_info"   # table receiving the checked rows

    # table_exists is compared against 1 for "exists", as in the original.
    if DbRun.table_exists(initial_table) != 1:
        print("Error: Get initial_data!")
        return
    initial_data = DbRun.initial_data(initial_table, time_before, time_now)

    DataDeal = dd.DataDeal()  # DataFrame helper
    initial_df = DataDeal.initial_df(initial_data)

    if DbRun.table_exists(insert_table) == 1:
        # Fetched for the (currently disabled) duplicate check; the value
        # is not used yet. NOTE(review): drop the call if it is not needed.
        contrast_data = DbRun.contrast_data(insert_table)
    else:
        DbRun.new_table(insert_table)

    output_df = IntegrateData(initial_df)
    DbRun.insert_data(insert_table, output_df)


def job():
    """Scheduled-task entry point: run ``main`` once."""
    main()


if __name__ == '__main__':
    main()
    # Daily scheduling through the `schedule` package is currently disabled;
    # the script performs a single run.
import re
import pandas as pd


def MusicCheck(input_df):
    """Yield review records for rows whose query looks like a music request.

    A row is reported when its query matches the music keyword pattern, is
    not on one of the exclusion lists and was classified into the ``music``
    domain. For such rows the intent and the response text are judged.

    Rows that previously vanished through a swallowed exception are now
    handled explicitly: a non-string query is skipped, a non-string intent
    or response is judged ``'no'``. The bare ``except`` around the ``yield``
    was removed because it also swallowed ``GeneratorExit``.

    Args:
        input_df: DataFrame with at least the columns ``query``, ``domain``,
            ``intent``, ``response_text``, ``date_time``, ``request_id``,
            ``mac_wifi`` and ``user_id``.

    Yields:
        dict: the original fields plus ``domain_is_right``,
        ``intent_is_right`` and ``response_is_right`` (``'yes'``/``'no'``).
    """
    print('The module of music_check is running!')

    # Compiled once instead of once per row.
    query_pattern = re.compile(r'.*?(播放|音乐|摇滚乐|一首|歌|刘德华|点首|给我放|歌曲|周杰伦|想听|李健|伍佰|张学友|来首|酒醉的蝴蝶|陈奕迅|郑丽媛|要听|降央|不爱我就别伤害我|放|筷子兄弟|林俊杰|林忆莲|唱|邓丽君|韩宝仪|换首|放一首爱不停歇|奢香夫人|音乐海浪|陈百强|草原绿了|下载原谅|小苹果|她的背影|放那个音乐|放最近|小小的太阳|等你等了那么久|英文歌|换慢歌|爱的罗曼史|你的风景|我的唇吻不到我爱的人|换首少年|泉水叮咚|大悲咒|祝你生日快乐|后来遇见他|我的梦英文版|刀郎|我想父亲的草原|周华健|痴心换情深|来生再去拥抱你|薛之谦的怪咖|爱你一万年|高胜美的缘|周思涵|远走高飞|周深的|我想听下雨天|再见只是陌生人|寂寞是你给的苦|一个人挺好|桥边姑娘|帮你首情人|朴彩英|小手拍拍|播放一点|没有你陪伴真的好孤单|讲不出再见|夏天的风|天空之城|我们都一样|播放冷漠的|来一首莫文蔚|张冬玲的牛在飞|忘情牛肉面|花桥流水|百花香|孟婆的碗|相思的债|一生回味一面|射雕英雄传插曲|后来遇见他|播放最近流行歌曲|黄凯芹的晚秋|韩磊的|播放古筝版权御天下|野狼disco|陈粒的走马|随便来一曲|后生仔|惜别的海岸|让我欢喜让我忧|你偷走了我的心|陈冠希的战争|唱歌不忘阶级苦|唱亲爱的你在哪里|毛不易的春边|古朗月行|毛不易的歌|宿命传说主题曲|孙露的|whatarewords|海草舞|播放一年级|再回到从前|站在草原望北京|阿果吉曲|笑看风云|海阔天空|王晓天的荣耀|华晨宇唱的哪吒|情缘等足一辈子|播放bigbang的loser|黄玫瑰|自由飞翔|不变的情缘|两个人的回忆一个人过|学猫叫|恋人心|记得忘记|郑智化的水手|放那个老妹你真美|播放大哲|放一首下定决心忘记你|千与千寻|你是我的妞|无法忘记你|手心里的温柔|外婆的澎湖湾|放小阿枫的歌|你的样子|落花诗图|小英雄大肚腩|爱在记忆中找你|白龙马|止战之殇|王杰的|放非酋|不变的情缘|来一曲这条街|山水组合|达拉崩|听一个人|换五月天的歌|古典音乐|把音乐打开|安静一点的音乐|放最近|小小的太阳|请播项羽|小老鼠上灯台|这首音乐叫什么|我想听下雨天|宝贝宝贝|林烁的惊雷|听抖音|莫文蔚|播放小螺号|播放最近流行歌曲|给我播放大王叫我来巡山|爱我就抱抱我|唱歌不忘阶级苦|你是人间四月天|你给我听听吗|邓紫棋的泡沫|潇洒走一回|一曲红尘|你笑起来真好看|放一首桃花运|帮我放小星星|跟你聊天就是想听听你的声音|播放山水组合|放那个音乐|一曲相思|丢了你|冷漠的|三水组合|琵琶语|小手拍拍|来曲春天|萨克斯|播放大哲|田一名的李莫愁|播放小白兔白又白|放小阿枫的歌|落花诗图|爱我不要丢下我|我的天空|帮好听的歌|梦回云南|千里之外|忘情水|把酒倒满|龙卷风|淋雨一直走|萨日朗|等你三千年|借酒浇愁|罗大佑的恋|死心塌地|甜蜜蜜|最炫民族风|林中的鸟|大河向东流|门丽|萨顶顶|张惠妹|大王叫我来巡山|刘三姐|想你的时候问月亮|大约在冬季|韩红|野花香|告白气球|醉赤壁|平凡之路|主题曲|兰花草|男儿当自强|帝女花|无人之岛|小城故事|赵雷|陶哲|陪你去流浪|万水千山总是情|恋曲|快乐崇拜|炸山姑娘|张碧晨|青藏高原|阿杜|黑鸭子|冬天里的一把火|两只老虎|爱我你就抱抱我|林宥嘉|陈蓉晖|叶倩文|虫儿飞|腾格尔|亲爱的你在哪里|王菲|隐形的翅膀|一剪梅|邓紫棋友谊地久天长|张国荣|当爱已成往事|风继续吹|草蜢的失恋阵线联盟|夫妻双双把家还|这条街|爱情这杯酒谁喝都得醉|蒋雪儿|小白兔白又白|张韶涵最新单曲|爱江山更爱美人|人的一生|月牙湾|将近酒|小鸡小鸡|青木林|邓紫棋|播数鸭子|江南style|每一个明天|万水千山|那就这样吧|春秋|小鸭子|我和我的祖国|来曲送亲|爱的路上千万里|另一种乡愁|来个孙露|点中文榜|请珍惜|李良|徐小凤|世上只有妈妈好|咖喱咖喱|听我说谢谢你|小蝌蚪找妈妈|我最亲爱的|忘川彼岸|无言的结局|山谷里的思念|霸王别姬|必杀技|刘文正|人间四月天|普通DISCO|李克勤|黑猫警长|我的祖国|听个小芳|风吹麦浪|么么哒|听少年|你到底爱谁|生日快乐|听万玲琳|迷人的危险|lostrivers|李昕融|你是我的唯一|陶晶莹|谢谢你因为有你|小白兔乖乖|郭峰|小阿枫|你的答案|独角戏|我把真心给了你|姜育恒|今生相爱|友谊地久天长|刘若英|光良的童话|错的时间遇见对的你|拥抱你离去|永不失联的爱|陪你一起去草原|爱江山更爱美人|我的快乐就是想你|命运交响曲|蓝色的多瑙河|北国之春|思念情缘|万爱千恩|播个龙王|itsok|张宇|最美的伤口|石头剪刀布|一曲回家|听和兰花在一起|职迷不误|梦中的婚礼|许巍|一起走过的日子|姑娘我爱你|就是爱你|我要你|海来阿木的|廖健|高天上流云|群星的乐曲|兄弟想你了|刘和刚|茉莉花|高山流水|好日子|记得咱的家|明天会更好|秋裤大叔|神奇的九寨|阿里阿里|小燕子|泥娃娃|兔子舞|理查德克莱德曼|黑龙的感谢|点燃一根烟|我的好妈妈|小跳蛙|小兔子乖乖|社会摇|葫芦娃|红马鞍|搞笑漓江曲).*')
    response_pattern = re.compile(r'.*?(一首|听|欣赏|播放|送给你|推荐|歌|翻到|开始|好的|曲库|一起|马上为|找到).*')

    # Queries containing any of these fragments are not music requests.
    banned_fragments = (
        '关闭', '别唱', '音乐关掉', '德云社', '音乐小镇点', '取消音乐',
        '音乐关', '恋恋不忘', '唐诗', '歌给关', '静夜思', '音乐增',
    )
    # Exact queries that are known false positives.
    banned_queries = {
        '播放儿童', '请播放话题', '播放莫言', '我要听童话故',
        '唱onei千恩万2000', '懂得珍惜才配拥有歌曲', '点一首真的爱你',
        '播放赛罗奥特曼', '点一首那个', '来一首幸福的爱', '不唱歌啦',
        '我要听慢歌', '给我放山楂树之恋', '帮我播慢歌',
        '我想听可口可乐还能不能愉', '音乐85', '播放肖娜', '播放没志气',
        '我想听毛下', '林宥嘉少女', '播放卡拉鸡', '播放半是蜜糖半是伤',
        '请播放一首唱古文', '播放双笙的我的一个道姑朋友', '放下沙',
        '我想听赛罗奥特曼',
    }
    accepted_intents = ('play', 'pause', 'next', 'search', 'choose', 'add')
    # Replies containing any of these are considered wrong answers.
    rejected_replies = (
        '抱歉', '人类的语言真是太复杂了', 'AankhenKhuli', '大大泡泡糖',
        '格力和你一起听奶茶', '是不是薛之谦的我害怕',
    )

    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        # A missing (non-string) query cannot be classified.
        if not isinstance(query, str) or query_pattern.search(query) is None:
            continue
        if query in banned_queries or any(frag in query for frag in banned_fragments):
            continue
        # Only rows already classified as music are reported.
        if domain != 'music':
            continue

        intent_ok = isinstance(intent, str) and any(tag in intent for tag in accepted_intents)
        response_ok = (
            isinstance(response_text, str)
            and response_pattern.search(response_text) is not None
            and not any(bad in response_text for bad in rejected_replies)
        )

        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'yes' if intent_ok else 'no',
            'response_is_right': 'yes' if response_ok else 'no',
        }

    print('The module of music_check is executed!')
import re
import pandas as pd


def NewsCheck(input_df):
    """Yield review records for rows whose query looks like a news request.

    A row is reported when its query mentions news, is not on an exclusion
    list and was classified into the ``news`` domain. The intent is right
    when it contains ``search``; the response is right when it is not None.

    Non-string queries are skipped and a non-string intent is judged
    ``'no'``. The bare ``except`` around the ``yield`` (which also swallowed
    ``GeneratorExit``) was removed.

    Args:
        input_df: DataFrame with the columns ``query``, ``domain``,
            ``intent``, ``response_text``, ``date_time``, ``request_id``,
            ``mac_wifi`` and ``user_id``.

    Yields:
        dict: the original fields plus the three ``*_is_right`` verdicts.
    """
    print('The module of news_check is running!')

    query_pattern = re.compile(r'.*?(新闻|今日头条).*')
    banned_fragments = ('等于', '周杰伦', '制冷')
    banned_queries = {'我要听最新', '打开新闻', '我要听热门', '格力空调读新闻'}

    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        # A missing (non-string) query cannot be classified.
        if not isinstance(query, str) or query_pattern.search(query) is None:
            continue
        if query in banned_queries or any(frag in query for frag in banned_fragments):
            continue
        if domain != 'news':
            continue

        intent_ok = isinstance(intent, str) and 'search' in intent
        # NOTE(review): only None counts as a missing response here; NaN
        # values coming from pandas would still be judged 'yes' -- confirm.
        response_ok = response_text is not None

        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'yes' if intent_ok else 'no',
            'response_is_right': 'yes' if response_ok else 'no',
        }

    print('The module of news_check is executed!')
import re
import pandas as pd


def PlayControlCheck(input_df):
    """Yield review records for rows whose query is a playback command.

    A row is reported when its query matches the playback-control keyword
    pattern, passes both exclusion lists and was classified into the
    ``PlayControl`` domain. The intent is right when it contains
    ``control`` or ``query``; the response is right unless it contains the
    fallback phrase ``学艺不精``.

    Non-string queries are skipped; a non-string intent or response is
    judged ``'no'`` instead of silently dropping the row. The bare
    ``except`` around the ``yield`` was removed.

    Args:
        input_df: DataFrame with the columns ``query``, ``domain``,
            ``intent``, ``response_text``, ``date_time``, ``request_id``,
            ``mac_wifi`` and ``user_id``.

    Yields:
        dict: the original fields plus the three ``*_is_right`` verdicts.
    """
    print('The module of play_control_check is running!')

    query_pattern = re.compile(r'.*?(关闭播放|关闭音乐|停止音乐|换一|下一首|别唱了|音乐关|暂停|下一曲|换歌|继续播放|换音乐|关掉音乐|上一曲|不想听|关闭歌曲|不要放|换首歌|继续唱|请切|单曲循环|关闭故事|停止播放|关掉歌曲|歌曲关掉|切音乐|上一首歌|继续的|退出音乐|音乐停止|请播放下|播放上一首|换首音乐|下一个|把停止|这首歌是|换个歌|继续播|是什么歌|请播放|播放下|关音乐|关掉关掉音乐|打开音乐|请关了音乐|不听歌了|请关关闭|把歌声关掉|关闭关闭|把音乐打开|不听了|关了音乐|取消歌曲|调最小音乐|关闭儿歌|退出播放|停止放音乐|关掉故事|继续音乐|重新播放|换个音乐|接着播放|切儿歌|继续|换首|不要音乐|歌曲关闭|歌名|这首歌|关掉歌|退出歌曲|换个音乐|换儿歌|故事关了|下一集|上一首|播放个仔停止|换个儿歌|关闭歌|停止我要听|播放音乐|停止放歌|哪个朝代的|这诗谁写的|作者是谁).*')

    # First filter: fragments that mark the query as not a playback command.
    banned_fragments = (
        '等于', '乘', '最小风', '有点冷', '查一查', '切换到一', '切换到二',
        '停止我要听童话故事', '停止播放我要听故事', '关闭语音暂停播放',
        '几点钟', '这诗',
    )
    # Second filter, applied together with the domain test in the original.
    domain_banned_fragments = ('帮我放下', '帮我播放下', '请播放下歌')
    domain_banned_queries = {'是哪个朝代的', '作者是谁', '这首歌叫什么名字'}

    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        # A missing (non-string) query cannot be classified.
        if not isinstance(query, str) or query_pattern.search(query) is None:
            continue
        if any(frag in query for frag in banned_fragments):
            continue
        if domain != 'PlayControl':
            continue
        if query in domain_banned_queries or any(frag in query for frag in domain_banned_fragments):
            continue

        intent_ok = isinstance(intent, str) and ('control' in intent or 'query' in intent)
        response_ok = isinstance(response_text, str) and '学艺不精' not in response_text

        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'yes' if intent_ok else 'no',
            'response_is_right': 'yes' if response_ok else 'no',
        }

    print('The module of play_control_check is executed!')
import re
import pandas as pd


def ScienceCheck(input_df):
    """Yield review records for rows whose query is a calculation/unit question.

    A row is reported when its query contains an arithmetic or unit keyword,
    does not contain ``站在`` and was classified into the ``science`` domain.
    The intent is right when it contains ``calculator`` or ``unit_``. The
    response is right when it contains ``等于``, ``单位`` or ``做不出来``,
    except for the one known-bad query ``24乘以50384乘以59等于``.

    Non-string queries are skipped; a non-string intent or response is
    judged ``'no'`` instead of silently dropping the row. The bare
    ``except`` around the ``yield`` was removed.

    Args:
        input_df: DataFrame with the columns ``query``, ``domain``,
            ``intent``, ``response_text``, ``date_time``, ``request_id``,
            ``mac_wifi`` and ``user_id``.

    Yields:
        dict: the original fields plus the three ``*_is_right`` verdicts.
    """
    print('The module of science_check is running!')

    query_pattern = re.compile(r'.*?(加|减|乘|除|等于|次方|光年|换算|长度).*')
    answer_markers = ('等于', '单位', '做不出来')

    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        # A missing (non-string) query cannot be classified.
        if not isinstance(query, str) or query_pattern.search(query) is None:
            continue
        if '站在' in query:
            continue
        if domain != 'science':
            continue

        intent_ok = isinstance(intent, str) and ('calculator' in intent or 'unit_' in intent)
        response_ok = (
            isinstance(response_text, str)
            and any(marker in response_text for marker in answer_markers)
            and '24乘以50384乘以59等于' not in query
        )

        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'yes' if intent_ok else 'no',
            'response_is_right': 'yes' if response_ok else 'no',
        }

    print('The module of science_check is executed!')
import re
import pandas as pd


def SportsCheck(input_df):
    """Yield review records for rows whose query looks like a sports question.

    A row is reported when its query contains a sports keyword and was
    classified into the ``sports`` domain. The intent is right when it
    contains ``search``; the response is right when it is not None.

    Fixes over the previous version:
      * a wrong intent was stored under the misspelled key
        ``reply_is_right``, so building the record raised a KeyError that
        the bare ``except`` swallowed and the row was never reported; it is
        now reported with ``intent_is_right == 'no'``;
      * the bare ``except`` around the ``yield`` (which also swallowed
        ``GeneratorExit``) was removed; non-string queries are skipped and
        a non-string intent is judged ``'no'``.

    Args:
        input_df: DataFrame with the columns ``query``, ``domain``,
            ``intent``, ``response_text``, ``date_time``, ``request_id``,
            ``mac_wifi`` and ``user_id``.

    Yields:
        dict: the original fields plus the three ``*_is_right`` verdicts.
    """
    print('The module of sports_check is running!')

    query_pattern = re.compile(r'.*?(赛事|比赛|nba|cba|中超|詹姆斯|科比|乔丹|国安|英超|切尔西|对阵|阿森纳|塞尔利亚人|冠军|足球|湖人|决赛|巴斯坦人|广州恒大|女排|火箭|联赛|皇马|梅西|雷霆|拜仁|曼联|奥运会|森林狼|凯尔特人|世界杯|欧洲杯|公牛队|活塞队|老鹰|步行者|尤文图斯|赢了|比分|阿斯特拉|多少分|杜兰特|篮网|热火|猛龙|灰熊|欧联杯|骑士|热刺|对手|皇家马德里|队|巴萨|美洲杯|利物浦|vs|ac米兰).*')

    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        # A missing (non-string) query cannot be classified.
        if not isinstance(query, str) or query_pattern.search(query) is None:
            continue
        if domain != 'sports':
            continue

        intent_ok = isinstance(intent, str) and 'search' in intent
        # NOTE(review): only None counts as a missing response; NaN values
        # coming from pandas would still be judged 'yes' -- confirm.
        response_ok = response_text is not None

        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'yes' if intent_ok else 'no',
            'response_is_right': 'yes' if response_ok else 'no',
        }

    print('The module of sports_check is executed!')
import re
import pandas as pd


def StockCheck(input_df):
    """Yield review records for rows whose query looks like a stock question.

    A row is reported when its query contains a stock-market keyword, is not
    on the exclusion list and was classified into the ``stock`` domain. The
    intent is right when it is not None; the response is right when it is
    not None and the query does not mention ``S.H.E``.

    Fixes over the previous version:
      * a missing intent was stored under the misspelled key
        ``reply_is_right``, so building the record raised a KeyError that
        the bare ``except`` swallowed and the row was never reported; it is
        now reported with ``intent_is_right == 'no'``;
      * the bare ``except`` around the ``yield`` (which also swallowed
        ``GeneratorExit``) was removed; non-string queries are skipped.

    Args:
        input_df: DataFrame with the columns ``query``, ``domain``,
            ``intent``, ``response_text``, ``date_time``, ``request_id``,
            ``mac_wifi`` and ``user_id``.

    Yields:
        dict: the original fields plus the three ``*_is_right`` verdicts.
    """
    print('The module of stock_check is running!')

    query_pattern = re.compile(r'.*?(股|行情|上证|换手率|市盈率|成交|市值|指数|大盘|走势|科创板|收盘).*')
    # Fragments of known false positives (songs, names, chit-chat).
    banned_fragments = (
        '隔壁的', '天天唠', '墙头', '张文斌', 'Dubbing', '革命', '小编',
        '一个瓶子', '安静', '股份有限公司', '退出', '为了炒', '少佐',
        '乔飞', '一般',
    )

    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        # A missing (non-string) query cannot be classified.
        if not isinstance(query, str) or query_pattern.search(query) is None:
            continue
        if any(frag in query for frag in banned_fragments):
            continue
        if domain != 'stock':
            continue

        # Any intent counts as right for stock rows; only None is wrong.
        intent_ok = intent is not None
        response_ok = response_text is not None and 'S.H.E' not in query

        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'yes' if intent_ok else 'no',
            'response_is_right': 'yes' if response_ok else 'no',
        }

    print('The module of stock_check is executed!')
import re
import pandas as pd


def TranslateCheck(input_df):
    """Yield review records for rows whose query is a translation request.

    A row is reported when its query contains a translation keyword, is not
    on an exclusion list and was classified into the ``translate`` domain.
    The intent is right when it contains ``translate``. The response is
    right when it is text, contains none of the digits 1, 2, 5, 7, 8 and is
    not exactly ``Ah``.

    Fixes over the previous version:
      * a wrong intent was stored under the misspelled key
        ``reply_is_right``, so building the record raised a KeyError that
        the bare ``except`` swallowed and the row was never reported; it is
        now reported with ``intent_is_right == 'no'``;
      * the bare ``except`` around the ``yield`` (which also swallowed
        ``GeneratorExit``) was removed; non-string queries are skipped and
        a non-string intent or response is judged ``'no'``.

    Args:
        input_df: DataFrame with the columns ``query``, ``domain``,
            ``intent``, ``response_text``, ``date_time``, ``request_id``,
            ``mac_wifi`` and ``user_id``.

    Yields:
        dict: the original fields plus the three ``*_is_right`` verdicts.
    """
    print('The module of translate_check is running!')

    query_pattern = re.compile(r'.*?(翻译|英文怎么说|英语怎么说|什么意思|怎么说|怎么拼写|英文|中文|英语).*')
    banned_fragments = ('英语介绍', '英文介绍', '傻逼', '闭嘴', '小逼崽子', '操我')
    banned_queries = {'听英文', '你会英文吗', '七六中文怎么说'}
    # Digits whose presence marks a response as wrong (list kept as found).
    rejected_digits = ('1', '2', '5', '7', '8')

    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        # A missing (non-string) query cannot be classified.
        if not isinstance(query, str) or query_pattern.search(query) is None:
            continue
        if query in banned_queries or any(frag in query for frag in banned_fragments):
            continue
        if domain != 'translate':
            continue

        intent_ok = isinstance(intent, str) and 'translate' in intent
        response_ok = (
            isinstance(response_text, str)
            and not any(digit in response_text for digit in rejected_digits)
            and response_text != 'Ah'
        )

        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'yes' if intent_ok else 'no',
            'response_is_right': 'yes' if response_ok else 'no',
        }

    print('The module of translate_check is executed!')
import re
import pandas as pd


def UniversalControlCheck(input_df):
    """Yield review records for rows whose query is a generic device command.

    A row is reported when its query matches the device-control keyword
    pattern (volume, temperature, fan, power ...), contains none of the
    excluded fragments and was classified into the ``UniversalControl``
    domain. The intent is right when it contains ``control``; the response
    is always accepted.

    Non-string queries are skipped and a non-string intent is judged
    ``'no'`` instead of silently dropping the row. The bare ``except``
    around the ``yield`` (which also swallowed ``GeneratorExit``) was
    removed.

    Args:
        input_df: DataFrame with the columns ``query``, ``domain``,
            ``intent``, ``response_text``, ``date_time``, ``request_id``,
            ``mac_wifi`` and ``user_id``.

    Yields:
        dict: the original fields plus the three ``*_is_right`` verdicts.
    """
    print('The module of universal_control_check is running!')

    query_pattern = re.compile(r'.*?(声音大|音量|小一点|调高|调低|调大|调到|开机|设置24度|静音风|关掉啊|左右扫风|声音调|风速|最大风|小声点|升高|扫风|温度提高|温度调高|温度设|温度2|小点声|上下扫风|调小|大一点|灯光开|降低|关闭吧|有点热|调至|风量|开扫风|最低|最大|增大|关机|上下摇摆|降2|加大|减低|小一些|大些|下调|太冷了|低风|设至|左右风|档|摆风|显示|送风|启动吧|中速|结束吧|调为|强劲风|请停止|关闭吧|关了吧|灯|温度1|度|声音放小一点|关闭语音|语音关掉|小声一点|声音还是太大了|小声一点|声音小点|左右摇摆|声音最小|设为强风|声音再小点|请关闭|设置中风|上下风|给我中风|大声一点|声音减小|风力高风|大点声|给我关了|声音放到最小|减小声音|温度加|我觉得有点冷|帮我打开|左右摇摆|声音太大了|调成大风|风大点|自动风|有点冷|好冷啊|最小风|最高风|调节弱风|让屋里暖和|好热啊|帮我开启吧|静音模式|关掉|风小点|微风|想强风|吵死了|增加风|声大点|开下吹风|声音高一点|声音100分|再小声音|请关掉|大声音|开最小|低速风|低一点|声音放小点|声音低点|第六一集|加强风|最小声音|声音再提高|声音提高|太热了|声音放最小|启动啊|左右摆动|帮我关|请停掉).*')
    banned_fragments = ('空调', '平安夜', '安徒生', '婴儿')

    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        # A missing (non-string) query cannot be classified.
        if not isinstance(query, str) or query_pattern.search(query) is None:
            continue
        if any(frag in query for frag in banned_fragments):
            continue
        if domain != 'UniversalControl':
            continue

        intent_ok = isinstance(intent, str) and 'control' in intent

        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'yes' if intent_ok else 'no',
            # The response text is not inspected for this category.
            'response_is_right': 'yes',
        }

    print('The module of universal_control_check is executed!')
import re
import pandas as pd


def WeatherCheck(input_df):
    """Yield review records for rows whose query looks like a weather question.

    A row is reported when its query contains a weather keyword, contains
    none of the excluded fragments (mostly device-control wording) and was
    classified into the ``weather`` domain. The intent is right when it
    contains ``search``; the response is right when it is not None and the
    query is not the known-bad ``周六的天气空调周六的天气``.

    Non-string queries are skipped and a non-string intent is judged
    ``'no'`` instead of silently dropping the row. The bare ``except``
    around the ``yield`` (which also swallowed ``GeneratorExit``) was
    removed.

    Args:
        input_df: DataFrame with the columns ``query``, ``domain``,
            ``intent``, ``response_text``, ``date_time``, ``request_id``,
            ``mac_wifi`` and ``user_id``.

    Yields:
        dict: the original fields plus the three ``*_is_right`` verdicts.
    """
    print('The module of weather_check is running!')

    query_pattern = re.compile(r'.*?(天气|气温|天气预报|有雨|温度|多少度|北京天|穿什么衣服|今天广州|热了啊|热不热|下雨|雾霾|带伞|预报|今天会雨天|今天几度|明天呢|一晴气温|现在外面温度|多云|有冷空气|晚上冷不冷).*')
    banned_fragments = (
        '首', '高', '低', '设为', '天气闷热', '升', '制冷', '温度2',
        '我想听', '给爷', '降', '有点儿凉', '直角', '把温度', '播放下雨',
        '温度加', '自动风', '停止',
    )

    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        # A missing (non-string) query cannot be classified.
        if not isinstance(query, str) or query_pattern.search(query) is None:
            continue
        if any(frag in query for frag in banned_fragments):
            continue
        if domain != 'weather':
            continue

        intent_ok = isinstance(intent, str) and 'search' in intent
        response_ok = response_text is not None and query != '周六的天气空调周六的天气'

        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'yes' if intent_ok else 'no',
            'response_is_right': 'yes' if response_ok else 'no',
        }

    print('The module of weather_check is executed!')