diff --git a/README.md b/README.md index 7dedfd4759e31a58d49173fb7c6879f15ab7d5ac..95aff6d8444fb60b15ee50cb3f87aad17d94bdb9 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,63 @@ -# AutoTest +# 项目简介 +- 项目名称:数据自动化分类检测 +- 功能说明:自动化检测数据分类是否正确 +- 代码仓库:https://api.gree.com/gitlab/cuiweizhi/AutoTest.git +- 项目负责人:崔为之 +- 目录结构: +``` +├─period +│ ├─get_time.py // 获取时间的模块(用于从 mysql 获取数据) +| +├─config +| ├─__init__.py +| ├─sql.conf // 连接 mysql 数据的配置文件 +| ├─conn_sql.py // 读取 mysql 配置文件,进行各种数据库操作 +│ +├─data_processing +│ ├─data_deal.py // 用于数据清洗 +| +├─check // 检查各个分类正确性的模块 +| ├─airconditioner_check.py +| ├─ancient_poem_check.py +| ├─chat_check.py +| ├─encyclopedia_check.py +| ├─fm_check.py +| ├─global_control_check.py +| ├─holiday_check.py +| ├─music_check.py +| ├─news_check.py +| ├─play_control_check.py +| ├─science_check.py +| ├─sports_check.py +| ├─stocks_check.py +| ├─translate_check.py +| ├─universal_control_check.py +| ├─weather_check.py +| +| +├─requirement.txt //python3 环境的配置 +| +| +└─main.py // 主体代码 +``` +# 如何运行 +- 创建虚拟环境: python -m venv venv +- 安装软件包: pip install -r requirement.txt +- 运行服务: +``` +(1) 修改配置文件:sql.conf 和 main.py 中的参数 day (根据自己需要修改) +(2) 运行服务:python3 main.py +``` +# 版本信息 +``` +v1.0 +``` +# 更新日志 +- v1.0 版本 +``` +(1) 首次创建项目 +(2) 提供技能 +``` + + -This project mainly uses regular expressions to classify voice data. 
import re


class AirconditionerCheck:
    """Check how air-conditioner utterances were classified.

    The class keeps no instance state. ``airconditioner_check`` is a
    generator that yields one result record for the row it is given.
    """

    # Compiled once when the class is created instead of once per row.
    _QUERY_PATTERN = re.compile(r'.*?(空调|模式|制冷|制热|送风|把自动|停止风|打开加热|智能风|为自动|下出风|自清洁|环绕风|开风随|请休息|打开健康|抽湿|休息吧|温度提高5度格力金贝|无风感|祛湿).*')

    # A query containing any of these substrings is left unclassified.
    # NOTE: '加热' and '休息吧' are also pattern alternatives ('打开加热',
    # '休息吧'), so a query that matches through them is always skipped.
    _EXCLUDED_SUBSTRINGS = ('音量', '海洋风', '天气', '加热', '休息吧', '关闭空调关闭语音')

    # A query exactly equal to one of these strings is left unclassified.
    _EXCLUDED_QUERIES = ('空调温度', '把空调温度', '下出风', '格力空调最小音量')

    def _is_airconditioner_query(self, query):
        """Return True when ``query`` should be judged as an air-conditioner query."""
        # The original re.search() raised on a non-string query and then hit a
        # NameError; such a query is now simply treated as "no match".
        if not isinstance(query, str):
            return False
        if self._QUERY_PATTERN.search(query) is None:
            return False
        if any(word in query for word in self._EXCLUDED_SUBSTRINGS):
            return False
        return query not in self._EXCLUDED_QUERIES

    def airconditioner_check(self, row):
        """Judge the ``domain``, ``intent`` and ``response_text`` of one row.

        Args:
            row: mapping-like record that provides the keys ``date_time``,
                ``request_id``, ``mac_wifi``, ``user_id``, ``query``,
                ``domain``, ``intent`` and ``response_text``. For a matching
                query the verdicts are also written back into ``row``.

        Yields:
            One dict holding the input fields plus ``domain_is_right``,
            ``intent_is_right`` and ``response_is_right``. The three verdicts
            are ``""`` when the query is not judged by this check.

            For a judged query:
              * ``domain_is_right`` is 'yes' if the query contains '空调',
                otherwise 'no';
              * ``intent_is_right`` is 'yes' if the intent contains
                'control', otherwise 'no' (a ``None`` intent counts as 'no');
              * ``response_is_right`` is 'yes' for an empty response text.
                For a non-empty response the value already stored in ``row``
                is kept, or ``""`` when the row has none. Previously this
                case raised ``KeyError`` and the row was silently dropped.

        If building the record fails unexpectedly, the error is printed and
        nothing is yielded.
        """
        domain = row['domain']
        query = row['query']
        intent = row['intent']
        response_text = row['response_text']

        try:
            if self._is_airconditioner_query(query):
                row['domain_is_right'] = 'yes' if '空调' in query else 'no'

                try:
                    intent_ok = 'control' in intent
                except TypeError:
                    # intent is None / not a container: it cannot be right.
                    intent_ok = False
                row['intent_is_right'] = 'yes' if intent_ok else 'no'

                if response_text == '':
                    row['response_is_right'] = 'yes'
                try:
                    response_flag = row['response_is_right']
                except KeyError:
                    # No verdict was ever stored for a non-empty response.
                    response_flag = ''

                flags = (row['domain_is_right'], row['intent_is_right'], response_flag)
            else:
                flags = ('', '', '')

            record = {
                'date_time': row['date_time'],
                'request_id': row['request_id'],
                'mac_wifi': row['mac_wifi'],
                'user_id': row['user_id'],
                'query': query,
                'domain': domain,
                'intent': intent,
                'response_text': response_text,
                'domain_is_right': flags[0],
                'intent_is_right': flags[1],
                'response_is_right': flags[2],
            }
        except Exception as e:
            print("The error of getting generator in the module of airconditioner_check():", e)
            return

        yield record
query_result = re.search(r'.*?(唐诗|宋词|下一句|陌上桑|诗|李白|杜甫|王维|王之涣|咏鹅|渔歌子|背|岳阳楼记|乡村四月|陶渊明|白居易|鹿柴|杨万里|李清照|静夜思|绝句|千金散尽|上一句|播放古诗|朗诵一首白|春蚕|朗诵|安得广厦千万间|刘禹锡|杨花落尽|野茫茫|观沧海|陋室铭|月是故乡明|桃花源记|锄禾|前不见古人|清平乐|登幽州台歌|小石潭记|江城子|清明时节|采薇|伯牙鼓琴|沁园春|迢迢牵牛星|踏歌行|咏柳|春望|一剪梅|鹅鹅|离骚|赠汪伦|木兰辞|朗读|卖油翁|孟浩然|枫桥夜泊|终南山|黄鹂鸣翠柳|蒹葭|孙权劝学|四时田园|离离原上草|见客棹歌回|天苍苍|晓出净慈寺|游子吟|子夜吴歌|凤凰台|处处闻啼鸟|烟花三月|凉州词|幽人应未眠|高鼎|白云生处|到西洲|白发三千丈|明月几时有|无边落木萧萧下|举头望明月|已亥杂诗|一岁一枯荣|新安吏|玉阶怨|关山月|过零丁洋|归去来兮|芙蓉楼送辛渐|枯藤老树昏鸦|南屏晚钟|小池|空山新雨后|渭城朝雨|早发白帝城|春眠不觉晓|八阵图|七步诗|题破山寺后禅院|送杜少府之任蜀州|青青子衿|雁门太守行|泊秦淮|播放凤求凰|登飞来峰|逢入京使|春夜洛城|夜雨寄北|忆江南|朱自清|无情未必真豪杰|东风不与周郎便|琵琶行|天涯若比邻|粒粒皆辛苦|夕阳无限好|红军不怕远征难|诗经小雅|敕勒歌|相思红豆生南国|浪淘沙|醉卧沙场君|六月二十七日望湖楼醉书|江南春|西江月|回乡偶书|记承天寺夜游|知否知否|卖炭翁|杜牧|回乡偶书|诗经|田园诗|采莲曲|迎春曲|广乐的诗|范仲淹|贺知章|张九龄|秋风词|黄河入海流|白日依山尽|风流天下闻|江上渔者|短歌行|咏梅|满江红|早知潮有信|花间一壶酒|寒蝉凄切|春寒赐浴华清池|牧童遥指杏花村|归园田居|赤壁怀古|又岂在朝朝暮暮|两情若是久长时|雕栏玉砌应犹在|千山鸟飞绝|常记溪亭日暮|北风卷地白草折|金樽清酒斗十千|苏轼|谁道人生无再少|寻寻觅觅|云母屏风烛影深|离别家乡岁月多|造化钟神秀|何当共剪西窗烛|庄生晓梦迷蝴蝶|李煜).*', query) + + except Exception as e: + print("The error of getting query_result in the module of ancient_poem_check():", e) + + # 异常捕获 + try: + # 条件判断 + if query_result is not None and '背影' not in query and '想听诗歌' not in query and '背包' not in query and '高英' not in query and 'theme背' not in query and '诗为有' not in query and 'june' not in query and '落花诗' not in query and '闭嘴' not in query and '黑锅' not in query and '占廷' not in query and '收听太' not in query and 'fm三' not in query and '唐诗蝉' not in query and '穿条秋裤回家' not in query and '菊花二' not in query and '倪方六' not in query and '第八代' not in query and '三三原则' not in query and '手淫危害' not in query and '正式参战' not in query and 'a上' not in query and 'raze' not in query and '停止' not in query and query != '秋意秋意诗意' and query != '给我背一个白居易' and query != '来一首绝句': + row['domain_is_right'] = 'yes' + + if 'search' in intent: + row['intent_is_right'] = 'yes' + else: + row['intent_is_right'] = 'no' + + if '作品' in response_text or '来自' in response_text or '全文如下' in response_text: + row['response_is_right'] = 'yes' + elif response_text is None: + row['response_is_right'] = 
class ChatCheck:
    """Check how chat utterances were classified.

    A query that triggers any rule in ``_FLAG_RULES`` gets 'no' for all three
    verdicts; every other query is left unjudged (empty verdicts).
    """

    # Each rule is (trigger substrings, vetoing substrings, vetoing exact queries).
    # A rule fires when the query contains at least one trigger, contains no
    # vetoing substring and is not equal to a vetoing exact query. All rules
    # lead to the same verdict, so their order does not matter.
    # The group labels below are a reviewer's reading of the keywords.
    _FLAG_RULES = (
        # Presumably temperature / fan-speed commands.
        (
            ('温度升', '最大风', '强风', '风小点', '调到低速', '切换到最小温度', '温度放到30度', '设为30度', '高点高风', '风量', '降低20', '调到28度', '风速调到最大', '温度调大到', '风速调到中', '温度调到', '温度达到', '调低低档', '温度升高', '温度调', '调大低档', '关机关闭', '低档调到最低', '调大到30度', '调高中风档', '风速设为自动风', '调大到中风档'),
            (),
            (),
        ),
        # Presumably air-conditioner mode / power commands.
        (
            ('换风模式', '上下扫风', '辅热模式', '制热', '热风', '定向扫风', '空调模式', '制冷', '暖空调', '采暖的', '热空调', '暖风', '打开空调', '关闭空调', '暖风', '空调温度', '空调风档', '空调风速设', '格力空调有点冷', '自动风速格力空调', '降低20格力空调', '关掉空调', '关闭格力空调', '格力空调关闭', '将开空调', '最小风量空调'),
            (),
            (),
        ),
        # Presumably story / radio requests.
        (('丑小鸭', '安徒生', '三国演义', '电台'), ('几点钟',), ()),
        # Presumably arithmetic questions.
        (('乘以', '除以'), ('歌行',), ()),
        # Presumably poetry requests.
        (
            ('唐诗', '古诗', '白居易', '陌上桑', '李煜', '诗歌', '杜甫', '乐府诗'),
            ('自动风', '最小', '国庆节'),
            (),
        ),
        # Presumably holiday / date questions.
        (
            ('平安夜', '是万圣节', '十二月二十六日是', '一月二十五号是', '十二月二十二日是', '南瓜节', '10月31日万圣节'),
            (),
            ('平安夜部', '平安夜吗', '不是是平安夜'),
        ),
        # Presumably weather questions.
        (('天气',), ('新闻',), ()),
        # Presumably news requests.
        (('热点新', '新闻联播', '打开头条'), ('音量', '模式', '天气'), ()),
        # Presumably song requests.
        (
            ('刘德华', '点一首', '广东不下雪', '塔斯肯的铃声', '小兔子乖乖', '如果高兴你就拍拍手', '笑起来真好看', '不过人间', '漂洋过海来看你', '江南style', '桥边的姑娘', '海来阿木', '百鸟朝凤', '谢谢你的爱', '可可托海的牧羊人'),
            (),
            (),
        ),
        # Presumably playback / volume control and media titles.
        (
            ('关闭刚才播放', '切换到薛之谦', '关闭音乐', '打开酷狗', '银河奥特曼', '锦衣之下', '音乐也大一点', '单曲循环', '关闭语音', '最小音量', '音乐调小', '皮卡丘', '春光灿烂猪八戒', '赛罗奥特曼格斗', '萌鸡小队', '三嫁惹君心', '蜘蛛侠', '播放梦幻奇缘', '播放甲午中日战争', '小猴子爬山', '火星情报局', '播放留言', '播放音乐', '声音关小一点', '关掉音乐', '请播交响乐', '音乐放小点', '音乐调大', '关闭相声'),
            ('12月26', '几日', '新闻'),
            (),
        ),
    )

    @classmethod
    def _is_flagged(cls, query):
        """Return True when ``query`` fires at least one rule in ``_FLAG_RULES``."""
        try:
            for triggers, veto_substrings, veto_queries in cls._FLAG_RULES:
                if not any(word in query for word in triggers):
                    continue
                if any(word in query for word in veto_substrings):
                    continue
                if query in veto_queries:
                    continue
                return True
        except TypeError:
            # query is None / not a container: previously this error was
            # swallowed and the row was dropped; now the row is left unjudged.
            return False
        return False

    def chat_check(self, row):
        """Judge the ``domain``, ``intent`` and ``response_text`` of one row.

        Args:
            row: mapping-like record that provides the keys ``date_time``,
                ``request_id``, ``mac_wifi``, ``user_id``, ``query``,
                ``domain``, ``intent`` and ``response_text``. When the query
                is flagged, the three 'no' verdicts are also written back
                into ``row``.

        Yields:
            One dict holding the input fields plus ``domain_is_right``,
            ``intent_is_right`` and ``response_is_right``: all 'no' when the
            query fires a rule, all ``""`` otherwise.

        If building the record fails unexpectedly, the error is printed and
        nothing is yielded.
        """
        domain = row['domain']
        query = row['query']
        intent = row['intent']
        response_text = row['response_text']

        try:
            if self._is_flagged(query):
                row['domain_is_right'] = 'no'
                row['intent_is_right'] = 'no'
                row['response_is_right'] = 'no'
                verdict = 'no'
            else:
                verdict = ''

            record = {
                'date_time': row['date_time'],
                'request_id': row['request_id'],
                'mac_wifi': row['mac_wifi'],
                'user_id': row['user_id'],
                'query': query,
                'domain': domain,
                'intent': intent,
                'response_text': response_text,
                'domain_is_right': verdict,
                'intent_is_right': verdict,
                'response_is_right': verdict,
            }
        except Exception as e:
            print("The error of getting generator in the module of chat_check():", e)
            return

        yield record
import re


class FmCheck:
    """Check how radio (fm) utterances were classified.

    The class keeps no instance state. ``fm_check`` is a generator that
    yields one result record for the row it is given.
    """

    # Compiled once when the class is created instead of once per row.
    _QUERY_PATTERN = re.compile(r'.*?(故事|白雪公主的|白雪公主讲|评书|小品|相声|京剧|美文之声|广播|笑林|黑猫警长|奥特曼|葫芦娃|fm|星辰变|童话格林|卖火柴|下一个三字经|水浒|频道|西游记|西厢记|小说|主播|个台|下一集|下一段|下一章|下一张|之声|收听|兆|FM|格林童话|戏曲|三字经|冰雪奇缘|京戏|郭德纲|集|丑小鸭|阿拉丁|米小圈|萌鸡小队|虾球传|经济杂谈|调频|亿万老婆买一送一|梁祝|袁阔成|三国演义|小猪佩奇|黄梅戏|电台|冬吴同学会|猪猪侠|游园惊梦|睡前故事|电视剧|青华浮梦|安徒生|二人转|下一节|脱口秀|海底小纵队).*')

    # A query containing any of these substrings is left unclassified.
    _EXCLUDED_SUBSTRINGS = ('关', '体操', '爱情故事', '建成了', '丰田', '配置', '研究', '走私', '黑锅', '奇异', '会议', '贞操', '你妹啊', '音量', '语音', '停止')

    # A query exactly equal to one of these strings is left unclassified.
    _EXCLUDED_QUERIES = ('不听故事', '唱的故事', '请找私有制的故事', '播放来的故事', '应该剪个两只小老虎的故事')

    # A response containing any of these phrases is judged wrong.
    _BAD_RESPONSE_PHRASES = ('抱歉', '绘本', '推荐那些事', '跳泥坑', '叮当头条', '保持独立', '来读诗', '来听听男一号吧', '推荐你听听最近很火的湖北传统采茶戏', '魔镜魔镜告诉我')

    def _is_fm_query(self, query):
        """Return True when ``query`` should be judged as a radio query."""
        # The original re.search() raised on a non-string query and then hit a
        # NameError; such a query is now simply treated as "no match".
        if not isinstance(query, str):
            return False
        if self._QUERY_PATTERN.search(query) is None:
            return False
        if any(word in query for word in self._EXCLUDED_SUBSTRINGS):
            return False
        return query not in self._EXCLUDED_QUERIES

    def fm_check(self, row):
        """Judge the ``domain``, ``intent`` and ``response_text`` of one row.

        Args:
            row: mapping-like record that provides the keys ``date_time``,
                ``request_id``, ``mac_wifi``, ``user_id``, ``query``,
                ``domain``, ``intent`` and ``response_text``. For a matching
                query the verdicts are also written back into ``row``.

        Yields:
            One dict holding the input fields plus ``domain_is_right``,
            ``intent_is_right`` and ``response_is_right``. The three verdicts
            are ``""`` when the query is not judged by this check.

            For a judged query:
              * ``domain_is_right`` is always 'yes';
              * ``intent_is_right`` is 'yes' unless the intent is ``None``;
              * ``response_is_right`` is 'no' when the response is ``None``
                or contains a phrase from ``_BAD_RESPONSE_PHRASES``,
                otherwise 'yes'.

        If building the record fails unexpectedly, the error is printed and
        nothing is yielded.
        """
        domain = row['domain']
        query = row['query']
        intent = row['intent']
        response_text = row['response_text']

        try:
            if self._is_fm_query(query):
                row['domain_is_right'] = 'yes'
                row['intent_is_right'] = 'yes' if intent is not None else 'no'

                response_bad = response_text is None or any(
                    phrase in response_text for phrase in self._BAD_RESPONSE_PHRASES
                )
                row['response_is_right'] = 'no' if response_bad else 'yes'

                flags = (
                    row['domain_is_right'],
                    row['intent_is_right'],
                    row['response_is_right'],
                )
            else:
                flags = ('', '', '')

            record = {
                'date_time': row['date_time'],
                'request_id': row['request_id'],
                'mac_wifi': row['mac_wifi'],
                'user_id': row['user_id'],
                'query': query,
                'domain': domain,
                'intent': intent,
                'response_text': response_text,
                'domain_is_right': flags[0],
                'intent_is_right': flags[1],
                'response_is_right': flags[2],
            }
        except Exception as e:
            print("The error of getting generator in the module of fm_check():", e)
            return

        yield record
@Author: Gree +# @Date: 2021-06-02 14:05:59 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-02 14:11:14 + + +import re +import pandas as pd + + +class GlobalControlCheck: + """全局控制语料自动化分类的检查""" + def global_control_check(self, row): + """ + global_control_check 函数: + input: + output: generator + features: 全局控制语料自动化分类的检查 + step1: 检查 global_control 的语料字段 domain、intent、response_text 是否正确 ✅ + """ + # 获取 domain + domain = row['domain'] + # 获取 query + query = row['query'] + # 获取 intent + intent = row['intent'] + # 获取 response_text + response_text = row['response_text'] + + # 捕获异常 + try: + # 条件判断 + if ('停止播放' in query or '关闭新闻' in query or '关掉新闻' in query or '关新闻' in query or '给我播放' in query or '音乐声音' in query or '音乐' in query or '暂停播放' in query or '关闭静音' in query or '声音关掉' in query or '大声点' in query or '调小一点' in query or '音小点' in query or '声音小一点' in query or '小声点' in query or '加大一点' in query or '声音不大' in query or '放大声点' in query or '调小点' in query or '增大音' in query or '放大' in query or '开到最小' in query or '音量' in query or '声音关' in query or '声音轻点' in query or '大点声' in query or '不听了' in query or '关闭语音' in query or '音低点' in query or '声音调' in query or '声音响' in query or '关小声' in query or '音小一点' in query or '声音大一点' in query or '声音再大' in query or '放小声音' in query or '声音大些' in query or '小点声儿' in query or '关闭声音' in query) and '打开军' not in query and '温度调到' not in query and '严肃处理' not in query and '在放手' not in query and '你还在讲' not in query and '学校' not in query and '入竹' not in query and '性交' not in query and '爆料' not in query and '关机' not in query and '饮料' not in query and '6哲' not in query and '最小风' not in query and '一线城市' not in query and '听不懂我讲话' not in query and '买' not in query and '小乖' not in query and '礼尚往来' not in query and '什么美食' not in query and '毕业' not in query and '小美' not in query and '小麦' not in query and '花儿也谢了' not in query and '打开9档' not in query and '解冻' not in query and '小白' not in query and '选择wifi' not in query and '小伙子' not in query and '新闻综合' not in 
import re


class HolidayCheck:
    """Check how holiday / date-and-time utterances were classified.

    The class keeps no instance state. ``holiday_check`` is a generator that
    yields one result record for the row it is given.
    """

    # Compiled once when the class is created instead of once per row.
    _QUERY_PATTERN = re.compile(r'.*?(多少点|几点|几日|几月|农历|现在时间|北京时间|什么时间|多少天|几天|礼拜|阳历|节日|[0-9]月|多少号|现在的时间|多长时间|今天星期几|今天几号|是初几|周几|什么时候|到过年|春节|圣诞节|还有多久过年|星期几|平安夜|国庆节|什么日子|过年|除夕|什么时候|看一下时间|父亲节|教师节|建党节|差几天|周几|父亲节|多久放假|几号|哪一天|哪天是|儿童节|元旦|劳动|清明节|元宵节|情人节|腊八节|重阳节|看一下时间|端午节|妇女节|中秋节|过年|愚人节|号还有多久|植树节|放几天|秋分|万圣节|母亲节|几时几分|一月|国际禁毒日|冬至|明天的|什么节).*')

    # A query containing any of these substrings is left unclassified.
    _EXCLUDED_SUBSTRINGS = ('几点起', '肯德基', '温度', '22度', '汇率', '剑杰', '新闻', '除以', '来一首', '技能', '张北北的歌', '停止', '音量', '现在北京12月')

    # A query exactly equal to one of these strings is left unclassified.
    _EXCLUDED_QUERIES = ('一月二十', '安阳16日是什么日子', '春节有多少天', '明天的', '冬至是哪', '冬至有多少天', '12月22日', '一月二十九号', '还有多少天帮我查一查', '1月25号', '四日是平安夜12月24日是平安夜吗')

    def _is_holiday_query(self, query):
        """Return True when ``query`` should be judged as a holiday query."""
        # The original re.search() raised on a non-string query and then hit a
        # NameError; such a query is now simply treated as "no match".
        if not isinstance(query, str):
            return False
        if self._QUERY_PATTERN.search(query) is None:
            return False
        if any(word in query for word in self._EXCLUDED_SUBSTRINGS):
            return False
        return query not in self._EXCLUDED_QUERIES

    def holiday_check(self, row):
        """Judge the ``domain``, ``intent`` and ``response_text`` of one row.

        Args:
            row: mapping-like record that provides the keys ``date_time``,
                ``request_id``, ``mac_wifi``, ``user_id``, ``query``,
                ``domain``, ``intent`` and ``response_text``. For a matching
                query the verdicts are also written back into ``row``.

        Yields:
            One dict holding the input fields plus ``domain_is_right``,
            ``intent_is_right`` and ``response_is_right``. The three verdicts
            are ``""`` when the query is not judged by this check.

            For a judged query:
              * ``domain_is_right`` is always 'yes';
              * ``intent_is_right`` is 'yes' if the intent contains 'search',
                otherwise 'no'. A ``None`` intent counts as 'no'; previously
                it raised ``TypeError`` and the row was silently dropped;
              * ``response_is_right`` is 'yes' when the response is not
                ``None`` and contains neither '我暂时' nor '春节是春节',
                otherwise 'no'.

        If building the record fails unexpectedly, the error is printed and
        nothing is yielded.
        """
        domain = row['domain']
        query = row['query']
        intent = row['intent']
        response_text = row['response_text']

        try:
            if self._is_holiday_query(query):
                row['domain_is_right'] = 'yes'

                try:
                    intent_ok = 'search' in intent
                except TypeError:
                    # intent is None / not a container: it cannot be right.
                    intent_ok = False
                row['intent_is_right'] = 'yes' if intent_ok else 'no'

                response_ok = (
                    response_text is not None
                    and '我暂时' not in response_text
                    and '春节是春节' not in response_text
                )
                row['response_is_right'] = 'yes' if response_ok else 'no'

                flags = (
                    row['domain_is_right'],
                    row['intent_is_right'],
                    row['response_is_right'],
                )
            else:
                flags = ('', '', '')

            record = {
                'date_time': row['date_time'],
                'request_id': row['request_id'],
                'mac_wifi': row['mac_wifi'],
                'user_id': row['user_id'],
                'query': query,
                'domain': domain,
                'intent': intent,
                'response_text': response_text,
                'domain_is_right': flags[0],
                'intent_is_right': flags[1],
                'response_is_right': flags[2],
            }
        except Exception as e:
            print("The error of getting generator in the module of holiday_check():", e)
            return

        yield record
re.search(r'.*?(播放|音乐|摇滚乐|一首|歌|刘德华|点首|给我放|歌曲|周杰伦|想听|李健|伍佰|张学友|来首|酒醉的蝴蝶|陈奕迅|郑丽媛|要听|降央|不爱我就别伤害我|放|筷子兄弟|林俊杰|林忆莲|唱|邓丽君|韩宝仪|换首|放一首爱不停歇|奢香夫人|音乐海浪|陈百强|草原绿了|下载原谅|小苹果|她的背影|放那个音乐|放最近|小小的太阳|等你等了那么久|英文歌|换慢歌|爱的罗曼史|你的风景|我的唇吻不到我爱的人|换首少年|泉水叮咚|大悲咒|祝你生日快乐|后来遇见他|我的梦英文版|刀郎|我想父亲的草原|周华健|痴心换情深|来生再去拥抱你|薛之谦的怪咖|爱你一万年|高胜美的缘|周思涵|远走高飞|周深的|我想听下雨天|再见只是陌生人|寂寞是你给的苦|一个人挺好|桥边姑娘|帮你首情人|朴彩英|小手拍拍|播放一点|没有你陪伴真的好孤单|讲不出再见|夏天的风|天空之城|我们都一样|播放冷漠的|来一首莫文蔚|张冬玲的牛在飞|忘情牛肉面|花桥流水|百花香|孟婆的碗|相思的债|一生回味一面|射雕英雄传插曲|后来遇见他|播放最近流行歌曲|黄凯芹的晚秋|韩磊的|播放古筝版权御天下|野狼disco|陈粒的走马|随便来一曲|后生仔|惜别的海岸|让我欢喜让我忧|你偷走了我的心|陈冠希的战争|唱歌不忘阶级苦|唱亲爱的你在哪里|毛不易的春边|古朗月行|毛不易的歌|宿命传说主题曲|孙露的|whatarewords|海草舞|播放一年级|再回到从前|站在草原望北京|阿果吉曲|笑看风云|海阔天空|王晓天的荣耀|华晨宇唱的哪吒|情缘等足一辈子|播放bigbang的loser|黄玫瑰|自由飞翔|不变的情缘|两个人的回忆一个人过|学猫叫|恋人心|记得忘记|郑智化的水手|放那个老妹你真美|播放大哲|放一首下定决心忘记你|千与千寻|你是我的妞|无法忘记你|手心里的温柔|外婆的澎湖湾|放小阿枫的歌|你的样子|落花诗图|小英雄大肚腩|爱在记忆中找你|白龙马|止战之殇|王杰的|放非酋|不变的情缘|来一曲这条街|山水组合|达拉崩|听一个人|换五月天的歌|古典音乐|把音乐打开|安静一点的音乐|放最近|小小的太阳|请播项羽|小老鼠上灯台|这首音乐叫什么|我想听下雨天|宝贝宝贝|林烁的惊雷|听抖音|莫文蔚|播放小螺号|播放最近流行歌曲|给我播放大王叫我来巡山|爱我就抱抱我|唱歌不忘阶级苦|你是人间四月天|你给我听听吗|邓紫棋的泡沫|潇洒走一回|一曲红尘|你笑起来真好看|放一首桃花运|帮我放小星星|跟你聊天就是想听听你的声音|播放山水组合|放那个音乐|一曲相思|丢了你|冷漠的|三水组合|琵琶语|小手拍拍|来曲春天|萨克斯|播放大哲|田一名的李莫愁|播放小白兔白又白|放小阿枫的歌|落花诗图|爱我不要丢下我|我的天空|帮好听的歌|梦回云南|千里之外|忘情水|把酒倒满|龙卷风|淋雨一直走|萨日朗|等你三千年|借酒浇愁|罗大佑的恋|死心塌地|甜蜜蜜|最炫民族风|林中的鸟|大河向东流|门丽|萨顶顶|张惠妹|大王叫我来巡山|刘三姐|想你的时候问月亮|大约在冬季|韩红|野花香|告白气球|醉赤壁|平凡之路|主题曲|兰花草|男儿当自强|帝女花|无人之岛|小城故事|赵雷|陶哲|陪你去流浪|万水千山总是情|恋曲|快乐崇拜|炸山姑娘|张碧晨|青藏高原|阿杜|黑鸭子|冬天里的一把火|两只老虎|爱我你就抱抱我|林宥嘉|陈蓉晖|叶倩文|虫儿飞|腾格尔|亲爱的你在哪里|王菲|隐形的翅膀|一剪梅|邓紫棋友谊地久天长|张国荣|当爱已成往事|风继续吹|草蜢的失恋阵线联盟|夫妻双双把家还|这条街|爱情这杯酒谁喝都得醉|蒋雪儿|小白兔白又白|张韶涵最新单曲|爱江山更爱美人|人的一生|月牙湾|将近酒|小鸡小鸡|青木林|邓紫棋|播数鸭子|江南style|每一个明天|万水千山|那就这样吧|春秋|小鸭子|我和我的祖国|来曲送亲|爱的路上千万里|另一种乡愁|来个孙露|点中文榜|请珍惜|李良|徐小凤|世上只有妈妈好|咖喱咖喱|听我说谢谢你|小蝌蚪找妈妈|我最亲爱的|忘川彼岸|无言的结局|山谷里的思念|霸王别姬|必杀技|刘文正|人间四月天|普通DISCO|李克勤|黑猫警长|我的祖国|听个小芳|风吹麦浪|么么哒|听少年|你到底爱谁|生日快乐|听万玲琳|迷人的危险|lostrivers|李昕融|你是我的唯一|陶晶莹|谢谢你因为有你|小白兔乖乖|郭峰|小阿枫|你的答案|独角戏|我把真心给了你|姜育恒|今生相爱|友谊地久天长|刘若英|光良的童话|错的时间遇见对的你|拥抱你离去|永不失联的爱|陪你一起去草原|爱江山更爱美人|我的快乐就是想你|命运交响曲|蓝色的多瑙河|北国之春|思念情缘|万爱千恩|播个龙王|itsok|张宇|最美的伤口|石头剪刀布|一曲回家|听和兰花在一起|职迷不误|梦中的婚礼|许巍|一起走过的日子|姑娘我爱你|就是爱你|
我要你|海来阿木的|廖健|高天上流云|群星的乐曲|兄弟想你了|刘和刚|茉莉花|高山流水|好日子|记得咱的家|明天会更好|秋裤大叔|神奇的九寨|阿里阿里|小燕子|泥娃娃|兔子舞|理查德克莱德曼|黑龙的感谢|点燃一根烟|我的好妈妈|小跳蛙|小兔子乖乖|社会摇|葫芦娃|红马鞍|搞笑漓江曲).*', query) + + except Exception as e: + print("The error of getting query_result in the module of music_check():", e) + + # 异常捕获 + try: + # 条件判断 + if query_result is not None and '关闭' not in query and '别唱' not in query and '音乐关掉' not in query and '德云社' not in query and '音乐小镇点' not in query and '取消音乐' not in query and '音乐关' not in query and '恋恋不忘' not in query and '唐诗' not in query and '歌给关' not in query and '静夜思' not in query and '音乐增' not in query and query != '播放儿童' and query != '请播放话题' and query != '播放莫言' and query != '我要听童话故' and query != '唱onei千恩万2000' and query != '懂得珍惜才配拥有歌曲' and query != '点一首真的爱你' and query != '播放赛罗奥特曼' and query != '点一首那个' and query != '来一首幸福的爱' and query != '不唱歌啦' and query != '我要听慢歌' and query != '给我放山楂树之恋' and query != '帮我播慢歌' and query != '我想听可口可乐还能不能愉' and query != '音乐85' and query != '播放肖娜' and query != '播放没志气' and query != '我想听毛下' and query != '林宥嘉少女' and query != '播放卡拉鸡' and query != '播放半是蜜糖半是伤' and query != '请播放一首唱古文' and query != '播放双笙的我的一个道姑朋友' and query != '放下沙' and query != '我想听赛罗奥特曼': + row['domain_is_right'] = 'yes' + + if 'play' in intent or 'pause' in intent or 'next' in intent or 'search' in intent or 'choose' in intent or 'add' in intent: + row['intent_is_right'] = 'yes' + else: + row['intent_is_right'] = 'no' + + # 正则表达式 + response_result = re.search(r'.*?(一首|听|欣赏|播放|送给你|推荐|歌|翻到|开始|好的|曲库|一起|马上为|找到).*', response_text) + + if response_result is not None and '抱歉' not in response_text and '人类的语言真是太复杂了' not in response_text and 'AankhenKhuli' not in response_text and '大大泡泡糖' not in response_text and '格力和你一起听奶茶' not in response_text and '是不是薛之谦的我害怕' not in response_text: + row['response_is_right'] = 'yes' + else: + row['response_is_right'] = 'no' + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': 
row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': row['domain_is_right'], + 'intent_is_right': row['intent_is_right'], + 'response_is_right': row['response_is_right'] + } + + else: + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': "", + 'intent_is_right': "", + 'response_is_right': "" + } + + except Exception as e: + print("The error of getting generator in the module of music_check():", e) + + + + diff --git a/check/news_check.py b/check/news_check.py new file mode 100644 index 0000000000000000000000000000000000000000..406c0b73a0dcfa6a431ae5109b9d6d00e2cea181 --- /dev/null +++ b/check/news_check.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2021-06-02 14:19:27 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-02 14:21:01 + + +import re +import pandas as pd + + +class NewsCheck: + """新闻语料自动化分类的检查""" + def news_check(self, row): + """ + news_check 函数: + input: + output: generator + features: 新闻语料自动化分类的检查 + step1: 检查 news 的语料字段 domain、intent、response_text 是否正确 ✅ + """ + # 获取 domain + domain = row['domain'] + # 获取 query + query = row['query'] + # 获取 intent + intent = row['intent'] + # 获取 response_text + response_text = row['response_text'] + + # 异常捕获 + try: + # 正则表达式匹配数据规律 + query_result = re.search(r'.*?(新闻|今日头条).*', query) + + except Exception as e: + print("The error of getting query_result in the module of news_check():", e) + + # 异常捕获 + try: + # 条件判断 + if query_result is not None and '等于' not in query and '周杰伦' not in query and '制冷' not in query and query != '我要听最新' and query != '打开新闻' and query != '我要听热门' and query != '格力空调读新闻': + row['domain_is_right'] = 'yes' + + if 'search' in intent: + row['intent_is_right'] = 'yes' + else: + 
row['intent_is_right'] = 'no' + + + if response_text is not None: + row['response_is_right'] = 'yes' + else: + row['response_is_right'] = 'no' + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': row['domain_is_right'], + 'intent_is_right': row['intent_is_right'], + 'response_is_right': row['response_is_right'] + } + + else: + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': "", + 'intent_is_right': "", + 'response_is_right': "" + } + + except Exception as e: + print("The error of getting generator in the module of news_check():", e) + + diff --git a/check/play_control_check.py b/check/play_control_check.py new file mode 100644 index 0000000000000000000000000000000000000000..4b5730a9a0f5eba9dc7448194dbe1e47ef9b2965 --- /dev/null +++ b/check/play_control_check.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2021-06-02 14:21:58 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-02 14:30:30 + + +import re +import pandas as pd + + +class PlayControlCheck: + """播放控制语料自动化分类的检查""" + def play_control_check(self, row): + """ + play_control_check 函数: + input: + output: generator + features: 播放控制语料自动化分类的检查 + step1: 检查 play_control 的语料字段 domain、intent、response_text 是否正确 ✅ + """ + # 获取 domain + domain = row['domain'] + # 获取 query + query = row['query'] + # 获取 intent + intent = row['intent'] + # 获取 response_text + response_text = row['response_text'] + + # 捕获异常 + try: + # 正则表达式匹配数据规律 + query_result = 
re.search(r'.*?(关闭播放|关闭音乐|停止音乐|换一|下一首|别唱了|音乐关|暂停|下一曲|换歌|继续播放|换音乐|关掉音乐|上一曲|不想听|关闭歌曲|不要放|换首歌|继续唱|请切|单曲循环|关闭故事|停止播放|关掉歌曲|歌曲关掉|切音乐|上一首歌|继续的|退出音乐|音乐停止|请播放下|播放上一首|换首音乐|下一个|把停止|这首歌是|换个歌|继续播|是什么歌|请播放|播放下|关音乐|关掉关掉音乐|打开音乐|请关了音乐|不听歌了|请关关闭|把歌声关掉|关闭关闭|把音乐打开|不听了|关了音乐|取消歌曲|调最小音乐|关闭儿歌|退出播放|停止放音乐|关掉故事|继续音乐|重新播放|换个音乐|接着播放|切儿歌|继续|换首|不要音乐|歌曲关闭|歌名|这首歌|关掉歌|退出歌曲|换个音乐|换儿歌|故事关了|下一集|上一首|播放个仔停止|换个儿歌|关闭歌|停止我要听|播放音乐|停止放歌|哪个朝代的|这诗谁写的|作者是谁).*', query) + + except Exception as e: + print("The error of getting query_result in the module of play_control_check():", e) + + # 捕获异常 + try: + # 条件判断 + if query_result is not None and '等于' not in query and '乘' not in query and '最小风' not in query and '有点冷' not in query and '查一查' not in query and '切换到一' not in query and '切换到二' not in query and '停止我要听童话故事' not in query and '停止播放我要听故事' not in query and '关闭语音暂停播放' not in query and '几点钟' not in query and '这诗' not in query and '帮我放下' not in query and '帮我播放下' not in query and '请播放下歌' not in query and query != '是哪个朝代的' and query != '作者是谁' and query != '这首歌叫什么名字': + row['domain_is_right'] = 'yes' + + if 'control' in intent or 'query' in intent: + row['intent_is_right'] = 'yes' + else: + row['intent_is_right'] = 'no' + + try: + if '学艺不精' not in response_text: + row['response_is_right'] = 'yes' + else: + row['response_is_right'] = 'no' + except: + pass + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': row['domain_is_right'], + 'intent_is_right': row['intent_is_right'], + 'response_is_right': row['response_is_right'] + } + + else: + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': "", + 'intent_is_right': "", 
+ 'response_is_right': "" + } + + except Exception as e: + print("The error of getting generator in the module of play_control_check():", e) + + + diff --git a/check/science_check.py b/check/science_check.py new file mode 100644 index 0000000000000000000000000000000000000000..2ac18659f371a19d04ca0078537a87ff5711fd34 --- /dev/null +++ b/check/science_check.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2021-06-02 14:31:12 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-02 14:34:55 + + +import re +import pandas as pd + + +class ScienceCheck: + """科学语料自动化分类的检查""" + def science_check(self, row): + """ + science_check 函数: + input: + output: generator + features: 科学语料自动化分类的检查 + step1: 检查 science 的语料字段 domain、intent、response_text 是否正确 ✅ + """ + # 获取 domain + domain = row['domain'] + # 获取 query + query = row['query'] + # 获取 intent + intent = row['intent'] + # 获取 response_text + response_text = row['response_text'] + + # 异常捕获 + try: + # 正则表达式匹配数据规律 + query_result = re.search(r'.*?(加|减|乘|除|等于|次方|光年|换算|长度).*', query) + + except Exception as e: + print("The error of getting query_result in the module of science_check():", e) + + # 异常捕获 + try: + # 条件判断 + if query_result is not None and '站在' not in query: + row['domain_is_right'] = 'yes' + + if 'calculator' in intent or 'unit_' in intent: + row['intent_is_right'] = 'yes' + else: + row['intent_is_right'] = 'no' + + + if ('等于' in response_text or '单位' in response_text or '做不出来' in response_text) and '24乘以50384乘以59等于' not in query: + row['response_is_right'] = 'yes' + else: + row['response_is_right'] = 'no' + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': row['domain_is_right'], + 'intent_is_right': row['intent_is_right'], + 'response_is_right': row['response_is_right'] + } + + else: + + 
# 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': "", + 'intent_is_right': "", + 'response_is_right': "" + } + + + except Exception as e: + print("The error of getting generator in the module of science_check():", e) + + + diff --git a/check/sports_check.py b/check/sports_check.py new file mode 100644 index 0000000000000000000000000000000000000000..69f551e27a2705cd949261b475b1b194ab81fb2d --- /dev/null +++ b/check/sports_check.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2021-06-02 14:35:10 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-02 14:38:01 + + +import re +import pandas as pd + + +class SportsCheck: + """体育语料自动化分类的检查""" + def sports_check(self, row): + """ + sports_check 函数: + input: + output: generator + features: 体育语料自动化分类的检查 + step1: 检查 sports 的语料字段 domain、intent、response_text 是否正确 ✅ + """ + # 获取 domain + domain = row['domain'] + # 获取 query + query = row['query'] + # 获取 intent + intent = row['intent'] + # 获取 response_text + response_text = row['response_text'] + + # 捕获异常 + try: + # 正则表达式匹配数据规律 + query_result = re.search(r'.*?(赛事|比赛|nba|cba|中超|詹姆斯|科比|乔丹|国安|英超|切尔西|对阵|阿森纳|塞尔利亚人|冠军|足球|湖人|决赛|巴斯坦人|广州恒大|女排|火箭|联赛|皇马|梅西|雷霆|拜仁|曼联|奥运会|森林狼|凯尔特人|世界杯|欧洲杯|公牛队|活塞队|老鹰|步行者|尤文图斯|赢了|比分|阿斯特拉|多少分|杜兰特|篮网|热火|猛龙|灰熊|欧联杯|骑士|热刺|对手|皇家马德里|队|巴萨|美洲杯|利物浦|vs|ac米兰).*', query) + + except Exception as e: + print("The error of getting query_result in the module of sports_check():", e) + + + # 捕获异常 + try: + # 条件判断 + if query_result is not None: + row['domain_is_right'] = 'yes' + + if 'search' in intent: + row['intent_is_right'] = 'yes' + else: + row['reply_is_right'] = 'no' + + try: + if response_text is not None: + row['response_is_right'] = 'yes' + else: + row['response_is_right'] = 'no' + except: + pass + + # 生成器 + yield { + 'date_time': 
row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': row['domain_is_right'], + 'intent_is_right': row['intent_is_right'], + 'response_is_right': row['response_is_right'] + } + + else: + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': "", + 'intent_is_right': "", + 'response_is_right': "" + } + + except Exception as e: + print("The error of getting generator in the module of sports_check():", e) + + + + + diff --git a/check/stocks_check.py b/check/stocks_check.py new file mode 100644 index 0000000000000000000000000000000000000000..ce621bdb3532a37c77975c1186ab25a979d77e08 --- /dev/null +++ b/check/stocks_check.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2021-06-02 14:38:22 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-02 14:43:12 + + +import re +import pandas as pd + + +class StockCheck: + """股票语料自动化分类的检查""" + def stock_check(self, row): + """ + stock_check 函数: + input: + output: generator + features: 股票语料自动化分类的检查 + step1: 检查 stock 的语料字段 domain、intent、response_text 是否正确 ✅ + """ + # 获取 domain + domain = row['domain'] + # 获取 query + query = row['query'] + # 获取 intent + intent = row['intent'] + # 获取 response_text + response_text = row['response_text'] + + # 异常捕获 + try: + # 正则表达式匹配数据规律 + query_result = re.search(r'.*?(股|行情|上证|换手率|市盈率|成交|市值|指数|大盘|走势|科创板|收盘).*', query) + + except Exception as e: + print("The error of getting query_result in the module of stock_check():", e) + + # 异常捕获 + try: + # 条件判断 + if query_result is not None and '隔壁的' not in query and '天天唠' not in query and '墙头' not in query and '张文斌' not in query and 'Dubbing' not in query and '革命' 
not in query and '小编' not in query and '一个瓶子' not in query and '安静' not in query and '股份有限公司' not in query and '退出' not in query and '为了炒' not in query and '少佐' not in query and '乔飞' not in query and '一般' not in query: + row['domain_is_right'] = 'yes' + + if intent is not None: + row['intent_is_right'] = 'yes' + else: + row['reply_is_right'] = 'no' + + if response_text is not None and 'S.H.E' not in query: + row['response_is_right'] = 'yes' + else: + row['response_is_right'] = 'no' + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': row['domain_is_right'], + 'intent_is_right': row['intent_is_right'], + 'response_is_right': row['response_is_right'] + } + + else: + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': "", + 'intent_is_right': "", + 'response_is_right': "" + } + + except Exception as e: + print("The error of getting generator in the module of stock_check():", e) + + + + + diff --git a/check/translate_check.py b/check/translate_check.py new file mode 100644 index 0000000000000000000000000000000000000000..46c2cf410c6690d6c21bca5867e8bc1d89cfc9c0 --- /dev/null +++ b/check/translate_check.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2021-06-02 14:43:25 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-02 14:55:02 + + +import re +import pandas as pd + + +class TranslateCheck: + """翻译语料自动化分类的检查""" + def translate_check(self, row): + """ + translate_check 函数: + input: + output: generator + features: 翻译语料自动化分类的检查 + step1: 检查 translate 的语料字段 domain、intent、response_text 是否正确 ✅ + """ + # 获取 domain + domain = 
row['domain'] + # 获取 query + query = row['query'] + # 获取 intent + intent = row['intent'] + # 获取 response_text + response_text = row['response_text'] + + # 异常捕获 + try: + # 正则表达式匹配数据规律 + query_result = re.search(r'.*?(翻译|英文怎么说|英语怎么说|什么意思|怎么说|怎么拼写|英文|中文|英语).*', query) + + except Exception as e: + print("The error of getting query_result in the module of translate_check():", e) + + # 异常捕获 + try: + # 条件判断 + if query_result is not None and '英语介绍' not in query and '英文介绍' not in query and '傻逼' not in query and '闭嘴' not in query and '小逼崽子' not in query and '操我' not in query and query != '听英文' and query != '你会英文吗' and query != '七六中文怎么说': + row['domain_is_right'] = 'yes' + + if 'translate' in intent: + row['intent_is_right'] = 'yes' + else: + row['reply_is_right'] = 'no' + + if response_text is not None and '1' not in response_text and '2' not in response_text and '5' not in response_text and '7' not in response_text and '8' not in response_text and response_text != 'Ah': + row['response_is_right'] = 'yes' + else: + row['response_is_right'] = 'no' + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': row['domain_is_right'], + 'intent_is_right': row['intent_is_right'], + 'response_is_right': row['response_is_right'] + } + + else: + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': "", + 'intent_is_right': "", + 'response_is_right': "" + } + + except Exception as e: + print("The error of getting generator in the module of translate_check():", e) + + + diff --git a/check/universal_control_check.py b/check/universal_control_check.py new file mode 100644 index 
0000000000000000000000000000000000000000..46f70258c4c9a3702a8784646e9566e7ac8e783a --- /dev/null +++ b/check/universal_control_check.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2021-06-02 14:47:12 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-02 14:54:58 + + +import re +import pandas as pd + + +class UniversalControlCheck: + """通用控制语料自动化分类的检查""" + def universal_control_check(self, row): + """ + universal_control_check 函数: + input: + output: generator + features: 通用控制语料自动化分类的检查 + step1: 检查 UniversalControl 的语料字段 domain、intent、response_text 是否正确 ✅ + """ + # 获取 domain + domain = row['domain'] + # 获取 query + query = row['query'] + # 获取 intent + intent = row['intent'] + # 获取 response_text + response_text = row['response_text'] + + # 异常捕获 + try: + # 正则表达式匹配数据规律 + query_result = re.search(r'.*?(声音大|音量|小一点|调高|调低|调大|调到|开机|设置24度|静音风|关掉啊|左右扫风|声音调|风速|最大风|小声点|升高|扫风|温度提高|温度调高|温度设|温度2|小点声|上下扫风|调小|大一点|灯光开|降低|关闭吧|有点热|调至|风量|开扫风|最低|最大|增大|关机|上下摇摆|降2|加大|减低|小一些|大些|下调|太冷了|低风|设至|左右风|档|摆风|显示|送风|启动吧|中速|结束吧|调为|强劲风|请停止|关闭吧|关了吧|灯|温度1|度|声音放小一点|关闭语音|语音关掉|小声一点|声音还是太大了|小声一点|声音小点|左右摇摆|声音最小|设为强风|声音再小点|请关闭|设置中风|上下风|给我中风|大声一点|声音减小|风力高风|大点声|给我关了|声音放到最小|减小声音|温度加|我觉得有点冷|帮我打开|左右摇摆|声音太大了|调成大风|风大点|自动风|有点冷|好冷啊|最小风|最高风|调节弱风|让屋里暖和|好热啊|帮我开启吧|静音模式|关掉|风小点|微风|想强风|吵死了|增加风|声大点|开下吹风|声音高一点|声音100分|再小声音|请关掉|大声音|开最小|低速风|低一点|声音放小点|声音低点|第六一集|加强风|最小声音|声音再提高|声音提高|太热了|声音放最小|启动啊|左右摆动|帮我关|请停掉).*', query) + + except Exception as e: + print("The error of getting query_result in the module of universal_control_check():", e) + + + # 异常捕获 + try: + # 条件判断 + if query_result is not None and '空调' not in query and '平安夜' not in query and '安徒生' not in query and '婴儿' not in query: + row['domain_is_right'] = 'yes' + + if 'control' in intent: + row['intent_is_right'] = 'yes' + else: + row['intent_is_right'] = 'no' + + row['response_is_right'] = 'yes' + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 
'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': row['domain_is_right'], + 'intent_is_right': row['intent_is_right'], + 'response_is_right': row['response_is_right'] + } + + else: + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': "", + 'intent_is_right': "", + 'response_is_right': "" + } + + except Exception as e: + print("The error of getting generator in the module of universal_control_check():", e) + + + + + + diff --git a/check/weather_check.py b/check/weather_check.py new file mode 100644 index 0000000000000000000000000000000000000000..d5b74f4ec5a7842b8dfbe29d39501c4984537f4e --- /dev/null +++ b/check/weather_check.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2021-06-02 14:52:47 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-02 14:54:53 + + +import re +import pandas as pd + + +class WeatherCheck: + """天气语料自动化分类的检查""" + def weather_check(self, row): + """ + weather_check 函数: + input: + output: generator + features: 天气语料自动化分类的检查 + step1: 检查 weather 的语料字段 domain、intent、response_text 是否正确 ✅ + """ + # 获取 domain + domain = row['domain'] + # 获取 query + query = row['query'] + # 获取 intent + intent = row['intent'] + # 获取 response_text + response_text = row['response_text'] + + # 异常捕获 + try: + # 正则表达式匹配数据规律 + query_result = re.search(r'.*?(天气|气温|天气预报|有雨|温度|多少度|北京天|穿什么衣服|今天广州|热了啊|热不热|下雨|雾霾|带伞|预报|今天会雨天|今天几度|明天呢|一晴气温|现在外面温度|多云|有冷空气|晚上冷不冷).*', query) + + except Exception as e: + print("The error of getting query_result in the module of weather_check():", e) + + # 异常捕获 + try: + # 条件判断 + if query_result is not None and '首' not in query and '高' not in query and '低' not in query and '设为' not in query and '天气闷热' not in query and '升' not in query and '制冷' not in query and 
'温度2' not in query and '我想听' not in query and '给爷' not in query and '降' not in query and '有点儿凉' not in query and '直角' not in query and '把温度' not in query and '播放下雨' not in query and '温度加' not in query and '自动风' not in query and '停止' not in query: + row['domain_is_right'] = 'yes' + + if 'search' in intent: + row['intent_is_right'] = 'yes' + else: + row['intent_is_right'] = 'no' + + if response_text is not None and query != '周六的天气空调周六的天气': + row['response_is_right'] = 'yes' + else: + row['response_is_right'] = 'no' + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': row['domain_is_right'], + 'intent_is_right': row['intent_is_right'], + 'response_is_right': row['response_is_right'] + } + + else: + + # 生成器 + yield { + 'date_time': row['date_time'], + 'request_id': row['request_id'], + 'mac_wifi': row['mac_wifi'], + 'user_id': row['user_id'], + 'query': query, + 'domain': domain, + 'intent': intent, + 'response_text': response_text, + 'domain_is_right': "", + 'intent_is_right': "", + 'response_is_right': "" + } + + except Exception as e: + print("The error of getting generator in the module of weather_check():", e) diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/config/conn_sql.py b/config/conn_sql.py new file mode 100644 index 0000000000000000000000000000000000000000..4933defc3630aa907741bb462932b6be22c53514 --- /dev/null +++ b/config/conn_sql.py @@ -0,0 +1,304 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2021-05-31 18:26:52 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-01 14:19:52 + + +import os +import re +import configparser +from datetime import datetime +import pymysql + + +class ConnSql(object): + """ + 1. 
构建一个连接数据库的父类:读取配置文件,连接数据库。 + """ + def __init__(self): + """ + __init__ 函数: + input: + output: + features: 定义初始变量 + step1: 定义 self.dirName、self.fileName、self.cf 等对象 + """ + # 获取 self.dirName + self.dirName = os.path.split(os.path.realpath(__file__))[0] + # 获取 self.fileName + self.fileName = os.path.join(self.dirName, "sql.conf") + # 生成 cf 对象 + self.cf = configparser.ConfigParser() + + + def conn_sql(self): + """ + conn_sql 函数: + input: + output: conn + features: 读取 sql.conf 配置文件,连接数据库 + step1: 读取 sql.conf 配置文件,并利用相关信息,连接数据库 + step2: 返回 conn 对象 + """ + # 读取 sql.conf 配置文件 + self.cf.read(self.fileName) + + # 异常捕获 + try: + # 连接数据库 + conn = pymysql.connect( + host = str(self.cf.get("log_on", "host")), + port = int(self.cf.get("log_on", "port")), + user = str(self.cf.get("log_on", "user")), + passwd = str(self.cf.get("log_on", 'passwd')), + db = str(self.cf.get("log_on", "db")) + ) + # 输出 log 信息 + print("Database connection is successful!") + # 返回 conn 对象 + return conn + + except Exception as e: + print("The error of conn_sql():", e) + + +class DbRun(ConnSql): + """ + 1. 对于父类 ConnSql 的继承,继承了父类的属性和方法,并拥有自己的方法。 + 2. DbRun 类主要进行数据库的操作。 + """ + def table_exists(self, table_name, sql): + """ + table_exists 函数: + input: table_name, sql + output: True/False + features: 该方法用来判断我们所要的表是否存在当前数据库中 + step1: 判断我们所要的表是否存在当前数据库中 + step2: return True/False + """ + # 输出进入模块的 log 信息 + print("Loading the module of table_exists ...") + + # 调用父类方法获取 conn 对象 + conn = self.conn_sql() + + # 捕获异常 + try: + # 构建 cursor 对象 + with conn.cursor() as cursor: + # 执行 sql 语句 + cursor.execute(sql) + # 获取 tables + tables = [cursor.fetchall()] + # 获取 table_list + table_list = re.findall('(\'.*?\')', str(tables)) + # 数据清洗 + table_list = [re.sub("'",'',each) for each in table_list] + # 条件判断 + if table_name in table_list: + # 输出 log 信息 + print("The table of %s is exists!" % table_name) + # 存在返回 True + return True + else: + # 输出 log 信息 + print("The table of %s is not exists!" 
% table_name) + # 不存在返回 False + return False + + except Exception as e: + print("The error of table_exists:", e) + + finally: + # 关闭 conn + conn.close() + # 输出 log 信息 + print("The dealing of table_exists finished!") + + + + def new_table(self, table_name, sql): + """ + new_table 函数: + input: table_name, sql + output: + features: 在 mysql 新建一张数据表 + step1: 在 mysql 新建一张数据表 + """ + # 输出进入模块的 log 信息 + print("Loading the module of new_table ...") + + # 调用父类方法获取 conn 对象 + conn = self.conn_sql() + + # 捕获异常 + try: + """ + 1. sql 语句新建一个所需要字段的 data sheet。 + """ + # 构建 cursor 对象 + with conn.cursor() as cursor: + # 执行 sql 语句 + cursor.execute(sql) + # 事务的手动提交 + conn.commit() + # 输出执行 sql 语句的 log 信息 + print("Data sheet of %s is established!" % table_name) + + except Exception as e: + # 输出 log 信息 + print("The error of new_table:", e) + # 事务回滚 + conn.rollback() + + finally: + # 关闭 conn + conn.close() + # 输出 log 信息 + print("The dealing of new_table finished!") + + + def initial_data(self, sql): + """ + initial_data 函数: + input: sql + output: result + features: 获取需要进行自动化标注的最原始数据 + step1: 从相关数据表中获取相应数据 + """ + # 输出 log 信息 + print("Loading the module of initial_data ...") + + # 调用父类方法获取 conn 对象 + conn = self.conn_sql() + + # 进行异常捕获 + try: + # 构建 cursor 对象 + with conn.cursor() as cursor: + # 执行 sql + cursor.execute(sql) + # 获取数据 + result = cursor.fetchall() + + except Exception as e: + print("The error of initial_data(): ", e) + + else: + # 输出 log 日志 + print("Get initial data successfully!") + # 返回 result + return result + + finally: + # 关闭 conn + conn.close() + # 输出 log 信息 + print("Exiting the module of initial_data ...") + + + def insert_data(self, sql, df): + """ + insert_data 函数: + input: sql, df + output: 数据插入 mysql + features: 将自动化分类的最终数据插入相应的数据表中 + step1: 数据插入 + """ + # 输出 log 信息 + print("Loading the module of insert_data ...") + + # 获取插入 sql 语句的各个字段,用于批量插入数据 + sql_lst = [] + + # 进行异常捕获 + try: + # 遍历获取待插入数据 + for i in range(0, len(df)): + # 将时间类型转为字符串 + time = 
datetime.strftime(df.iloc[i, 0], "%Y-%m-%d %H:%M:%S") + # truple + sql_truple = (time, df.iloc[i, 1], df.iloc[i, 2], df.iloc[i, 3], df.iloc[i, 4], df.iloc[i, 5], df.iloc[i, 6], df.iloc[i, 7], df.iloc[i, 8], df.iloc[i, 9], df.iloc[i, 10]) + # 列表添加元素 + sql_lst.append(sql_truple) + + except Exception as e: + print("The error of getting sql_lst: ", e) + + # 进行异常捕获 + try: + # sql 语句 + # sql = "INSERT INTO " + table_name + " (date_time, request_id, mac_wifi, user_id, query, domain, intent, response_text, domain_is_right, intent_is_right, response_is_right) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" + # 调用父类方法获取 conn 对象 + conn = self.conn_sql() + # 条件判断 + if len(sql_lst) > 100000: + # 获取 num + num = len(sql_lst) // 100000 + # 定义 start + start = 0 + # 定义 end + end = 0 + # for 循环 + for i in range(0, num): + # 累加 + end += 100000 + # 切片操作 + insert_data = sql_lst[start:end] + # 输出 log 信息 + print("One hundred thousand data is inserting ...") + # ping()方法,该方法默认的有个 reconnect 参数,默认是 True,如果失去连接了会重连 + conn.ping() + # 获取 cursor + cursor = conn.cursor() + # 批量插入数据 + cursor.executemany(sql, insert_data) + # 事务的手动提交 + conn.commit() + # 累加 + start += 100000 + + # 条件判断 + if end < len(sql_lst): + # 切片操作 + insert_data = sql_lst[end:len(sql_lst)] + # 输出 log 信息 + print("The last part of data is inserting ...") + # ping()方法,该方法默认的有个 reconnect 参数,默认是 True,如果失去连接了会重连 + conn.ping() + # 获取 cursor + cursor = conn.cursor() + # 批量插入数据 + cursor.executemany(sql, insert_data) + # 事务的手动提交 + conn.commit() + + else: + pass + else: + # 输出 log 信息 + print("Data is inserting ...") + # ping()方法,该方法默认的有个 reconnect 参数,默认是 True,如果失去连接了会重连 + conn.ping() + # 获取 cursor + cursor = conn.cursor() + # 批量插入数据 + cursor.executemany(sql, sql_lst) + # 事务的手动提交 + conn.commit() + # 输出 log 日志 + print('The inserting of data is finished!') + + except Exception as e: + print("The error of insert_data(): ", e) + + finally: + # 关闭 conn + conn.close() + # 输出 log 信息 + print("Exiting the module of insert_data ...") + + + + 
diff --git a/config/sql.conf b/config/sql.conf new file mode 100644 index 0000000000000000000000000000000000000000..9f227e73ab81b66a3520820e75715f3dad321102 --- /dev/null +++ b/config/sql.conf @@ -0,0 +1,10 @@ +[log_on] +host = yourhost +port = 3306 +user = test +passwd = yourpasswd +db = semantic_data_analyze + + + + diff --git a/data_processing/__init__.py b/data_processing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/data_processing/data_deal.py b/data_processing/data_deal.py new file mode 100644 index 0000000000000000000000000000000000000000..fea191b84557b54ece6e59ceeb5b15d604f81689 --- /dev/null +++ b/data_processing/data_deal.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2020-12-17 15:57:05 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-01 14:53:06 + + +import pandas as pd +from config import conn_sql as cs +from period import get_time as gt + + +class DataDeal(object): + """ + 1. 
DataDeal 类主要用于获取所需要操作的数据框:原始数据框 + """ + def initial_df(self, initial_data): + """ + initial_df 函数: + input: initial_data + output: df_result + features: 初始化数据框 + step1: 数据框操作,数据去重、排序等操作 + step2: 返回 df_result + """ + # 输出进入 initial_df 模块的 log 信息 + print("Loading the module of initial_df ...") + # 捕获异常 + try: + # 数据框操作 + df_result = pd.DataFrame(list(initial_data), columns=["date_time", "request_id", "mac_wifi", "user_id", "query", "domain", "intent", "response_text"]) + # 新增 domain_is_right 列 + df_result['domain_is_right'] = '' + # 新增 intent_is_right 列 + df_result['intent_is_right'] = '' + # 新增 response_is_right 列 + df_result['response_is_right'] = '' + # 按 date_time 排序 + df_result = df_result.sort_values(["date_time"], ascending = False) + # 输出 log 信息 + print("The dimension of initial dataframe before duplicated: ", end = "") + # 去重前的数据框维度 + print(df_result.shape) + # query 列数据去重 + df_result = df_result.drop_duplicates(subset = "query") + # 按 date_time 排序 + df_result = df_result.sort_values(["date_time"], ascending = True) + # 输出 log 信息 + print("The dimension of initial dataframe: ", end = "") + # 输出当前数据框的维度(去重后) + print(df_result.shape) + # 返回 df_result + return df_result + + except Exception as e: + print("The error of initial_df(): ", e) + + else: + # 返回 df_result + return df_result + + finally: + # 输出退出 initial_df 模块的 log 信息 + print("Exiting the module of initial_df ...") + + diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..da419d085da15668b5814dea9ad4a559cd57db51 --- /dev/null +++ b/main.py @@ -0,0 +1,257 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2021-05-31 18:18:57 +# @Last Modified by: Gree +# @Last Modified time: 2021-06-02 16:42:41 + + +import re +import configparser + +# 第三方库 +import pandas as pd +from config import conn_sql as cs +from data_processing import data_deal as dd +from period import get_time as gt +from check import airconditioner_check as ac +from check import ancient_poem_check 
class AutoTest:
    """Run the classification check end to end: read the raw rows, run the
    per-domain checkers and write the annotated rows to the result table."""

    # Columns read from the source table, in select order.
    _RAW_COLUMNS = (
        "date_time", "request_id", "mac_wifi", "user_id",
        "query", "domain", "intent", "response_text",
    )
    # Verdict columns appended by the checkers.
    _FLAG_COLUMNS = ("domain_is_right", "intent_is_right", "response_is_right")

    def __init__(self):
        """
        __init__ function:
        input:
        output:
        features: define the initial attributes
            step1: set self.time_before, self.time_now, the table names,
                   the four SQL statements and the empty self.result_list
        """
        # days = 0 makes get_time return two empty strings (no time filter).
        self.time_before, self.time_now = gt.GetTime().get_time(days = 0)
        # Source table and destination table.
        self.initial_table = "ctoc_tb"
        self.insert_table = "final"
        # Statement used to list the existing tables.
        self.sql_1 = "show tables;"
        # Select statement; the BETWEEN filter is added only when a time window is set.
        select_template = "select " + ", ".join(self._RAW_COLUMNS) + " from %s"
        if self.time_before == "":
            self.sql_2 = select_template % self.initial_table
        else:
            self.sql_2 = (select_template + " where date_time BETWEEN %s and %s") % (
                self.initial_table, self.time_before, self.time_now)
        # DDL for the destination table.
        self.sql_3 = (
            "CREATE TABLE %s ("
            "`id` INT PRIMARY KEY AUTO_INCREMENT,"
            "`date_time` datetime DEFAULT NULL,"
            "`request_id` varchar(50) DEFAULT NULL,"
            "`mac_wifi` varchar(50) DEFAULT NULL,"
            "`user_id` varchar(50) DEFAULT NULL,"
            "`query` varchar(255) DEFAULT NULL,"
            "`domain` varchar(50) DEFAULT NULL,"
            "`intent` varchar(50) DEFAULT NULL,"
            "`response_text` text DEFAULT NULL,"
            "`domain_is_right` varchar(50) DEFAULT NULL,"
            "`intent_is_right` varchar(50) DEFAULT NULL,"
            "`response_is_right` varchar(50) DEFAULT NULL"
            ") CHARSET=utf8;"
        ) % self.insert_table
        # Insert statement; the %s placeholders are left in place (they are
        # passed on to DbRun.insert_data together with the dataframe).
        all_columns = self._RAW_COLUMNS + self._FLAG_COLUMNS
        self.sql_4 = "INSERT INTO {} ({}) VALUES ({})".format(
            self.insert_table,
            ", ".join(all_columns),
            ", ".join(["%s"] * len(all_columns)),
        )
        # Accumulates every checked row.
        self.result_list = []

    def integrate_data(self, df):
        """
        integrate_data function:
        input: df - dataframe holding the raw columns
        output: dataframe with raw + verdict columns sorted by date_time,
                or None if building that dataframe raises
        features: send every row to the checker of its domain and collect
                  the yielded items in self.result_list; rows whose domain has
                  no checker are kept with empty verdict columns
        """
        print("Loading the module of IntegrateData ...")
        # Lazy dispatch table: a checker object is only created for the row
        # that actually needs it.
        dispatch = {
            "Airconditioner": lambda r: ac.AirconditionerCheck().airconditioner_check(r),
            "ancient_poem": lambda r: apc.AncientPoemCheck().ancient_poem_check(r),
            "fm": lambda r: fc.FmCheck().fm_check(r),
            "globalctrl": lambda r: gcc.GlobalControlCheck().global_control_check(r),
            "holiday": lambda r: hc.HolidayCheck().holiday_check(r),
            "music": lambda r: mc.MusicCheck().music_check(r),
            "news": lambda r: nc.NewsCheck().news_check(r),
            "PlayControl": lambda r: pcc.PlayControlCheck().play_control_check(r),
            "science": lambda r: sc.ScienceCheck().science_check(r),
            "sports": lambda r: spc.SportsCheck().sports_check(r),
            "stock": lambda r: stc.StockCheck().stock_check(r),
            "translate": lambda r: tc.TranslateCheck().translate_check(r),
            "UniversalControl": lambda r: ucc.UniversalControlCheck().universal_control_check(r),
            "weather": lambda r: wc.WeatherCheck().weather_check(r),
            "baike": lambda r: ec.EncyclopediaCheck().encyclopedia_check(r),
            "chat": lambda r: cc.ChatCheck().chat_check(r),
        }
        # Iterate over a copy so the caller's dataframe is never touched.
        for _, record in df.copy().iterrows():
            checker = dispatch.get(record['domain'])
            if checker is not None:
                self.result_list.extend(checker(record))
                continue
            # No checker for this domain: pass the row through unjudged.
            passthrough = {name: record[name] for name in self._RAW_COLUMNS}
            passthrough.update(dict.fromkeys(self._FLAG_COLUMNS, ""))
            self.result_list.append(passthrough)

        try:
            frame = pd.DataFrame(
                self.result_list,
                columns = list(self._RAW_COLUMNS + self._FLAG_COLUMNS),
            )
            # Replace missing values by empty strings.
            frame = frame.astype(object).where(pd.notnull(frame), '')
            frame = frame.sort_values(["date_time"], ascending = True)
            print('The dimension of final dataframe:', frame.shape, sep = '')
            return frame

        except Exception as e:
            print("The error of integrate_data():", e)

        finally:
            print("Exiting the module of IntegrateData ...")

    def main(self):
        """
        main function:
        input:
        output: persisted data
        features: write the checked rows to the result table
            step1: stop if the source table does not exist, else fetch initial_data
            step2: build initial_df from initial_data
            step3: run the checks, create the result table when it is missing
            step4: insert the checked rows
        """
        db = cs.DbRun()
        # Nothing to do without the source table.
        if not db.table_exists(self.initial_table, self.sql_1):
            return
        raw_rows = db.initial_data(self.sql_2)
        raw_frame = dd.DataDeal().initial_df(raw_rows)
        checked_frame = self.integrate_data(raw_frame)
        # Create the destination table on first use; insert in either case.
        if not db.table_exists(self.insert_table, self.sql_1):
            db.new_table(self.insert_table, self.sql_3)
        db.insert_data(self.sql_4, checked_frame)
class GetTime(object):
    """
    1. Captures the current time and derives an earlier time point from it
       (e.g. one week or one month back). According to the original author's
       note, the resulting window is then used with the DbRun class of
       conn_sql.py to fetch the raw data.
    """
    def __init__(self):
        """
        __init__ function:
        input:
        output:
        features: define the initial attributes
            step1: set self.time_now_timestamp (now) and
                   self.time_mid_timestamp (250 days before now)
        """
        current = datetime.datetime.now()
        self.time_now_timestamp = current
        self.time_mid_timestamp = current - datetime.timedelta(days=250)

    def get_time(self, days = 0):
        """
        get_time function:
        input: days = 0 (default)
        output: (time_before, time_now); None when computing them raises
        features: build the start and end of the time window
            step1: days == 0 -> both values are empty strings
            step2: otherwise both values are "%Y-%m-%d %H:%M:%S" timestamps
                   wrapped in single quotes, time_before lying `days` days
                   before self.time_now_timestamp
        """
        print("Loading the module of get_time ...")
        try:
            if days == 0:
                # No window requested.
                time_before, time_now = '', ''
            else:
                stamp_format = "%Y-%m-%d %H:%M:%S"
                earlier = self.time_now_timestamp - datetime.timedelta(days = days)
                # Wrap both timestamps in single quotes.
                time_now = "'{}'".format(self.time_now_timestamp.strftime(stamp_format))
                time_before = "'{}'".format(earlier.strftime(stamp_format))

        except Exception as e:
            # Report only; the method then falls through and returns None.
            print("The error of get_time():", e)

        else:
            print("The information of time is gained!")
            return time_before, time_now

        finally:
            print("Exiting the module of get_time ...")
+Django==2.1 +django-apscheduler==0.3.0 +django-cors-headers==3.2.1 +django-database-pool==0.0.1 +elasticsearch==7.1.0 +gast==0.2.2 +google-pasta==0.2.0 +grpcio==1.30.0 +h5py==2.10.0 +idna==2.9 +importlib-metadata==1.6.1 +jieba==0.42.1 +joblib==0.15.1 +jsonformatter==0.2.3 +Keras==2.3.1 +Keras-Applications==1.0.8 +Keras-Preprocessing==1.1.2 +Markdown==3.2.2 +mock==4.0.2 +mysqlclient==1.4.6 +numpy==1.16.0 +opt-einsum==3.2.1 +pandas==1.0.1 +protobuf==3.12.2 +public==2019.4.13 +pycparser==2.20 +PyJWT==1.7.1 +PyMySQL==0.9.3 +python-consul==1.1.0 +python-dateutil==2.8.1 +pytz==2019.3 +PyYAML==5.3.1 +query-string==2019.4.13 +requests==2.23.0 +schedule==0.6.0 +scikit-learn==0.23.1 +scipy==1.5.0 +six==1.14.0 +sklearn==0.0 +sqlparse==0.3.0 +tensorboard==1.13.1 +tensorflow==1.13.1 +tensorflow-estimator==1.13.0 +termcolor==1.1.0 +threadpoolctl==2.1.0 +tzlocal==2.0.0 +urllib3==1.25.7 +Werkzeug==1.0.1 +wrapt==1.12.1 +xlrd==1.2.0 +zipp==3.1.0