# -*- coding: utf-8 -*-
# @Author: Gree
# @Date:   2021-06-02 14:35:10
# @Last Modified by:   Gree
# @Last Modified time: 2021-06-02 14:38:01
import re

import pandas as pd


class SportsCheck:
    """Checks for the automated classification of sports corpus rows."""

    # Keyword pattern a query must contain to be treated as a sports query.
    # Only whether it matches is used, never the matched text.
    _SPORTS_QUERY_RE = re.compile(r'.*?(赛事|比赛|nba|cba|中超|詹姆斯|科比|乔丹|国安|英超|切尔西|对阵|阿森纳|塞尔利亚人|冠军|足球|湖人|决赛|巴斯坦人|广州恒大|女排|火箭|联赛|皇马|梅西|雷霆|拜仁|曼联|奥运会|森林狼|凯尔特人|世界杯|欧洲杯|公牛队|活塞队|老鹰|步行者|尤文图斯|赢了|比分|阿斯特拉|多少分|杜兰特|篮网|热火|猛龙|灰熊|欧联杯|骑士|热刺|对手|皇家马德里|队|巴萨|美洲杯|利物浦|vs|ac米兰).*')

    # Columns copied from the input row into the output record unchanged.
    _PASSTHROUGH_FIELDS = ('date_time', 'request_id', 'mac_wifi', 'user_id')

    def sports_check(self, row):
        """Check the domain / intent / response_text fields of one corpus row.

        Args:
            row: A mapping-like object (a dict works; presumably a pandas row
                in production -- confirm against the caller) providing the
                keys ``date_time``, ``request_id``, ``mac_wifi``, ``user_id``,
                ``query``, ``domain``, ``intent`` and ``response_text``.

        Yields:
            At most one dict holding the eight input fields plus
            ``domain_is_right``, ``intent_is_right`` and ``response_is_right``.
            When the query contains a sports keyword the flags are:

            * ``domain_is_right``: always ``'yes'``
            * ``intent_is_right``: ``'yes'`` if ``'search'`` is in ``intent``,
              otherwise ``'no'``
            * ``response_is_right``: ``'yes'`` if ``response_text`` is not
              ``None``, otherwise ``'no'``

            When the query contains no sports keyword all three flags are
            empty strings. Nothing is yielded if the query cannot be matched
            (for example it is not a string) or the record cannot be built;
            the error is printed instead.

        Raises:
            KeyError: When iteration starts, if ``row`` lacks ``domain``,
                ``query``, ``intent`` or ``response_text`` (for mappings that
                raise ``KeyError`` on a missing key).

        Side effects:
            For a sports query the three flags are also written back onto
            ``row``, as the original implementation did.
        """
        domain = row['domain']
        query = row['query']
        intent = row['intent']
        response_text = row['response_text']

        try:
            is_sports_query = self._SPORTS_QUERY_RE.search(query) is not None
        except Exception as e:
            print("The error of getting query_result in the module of sports_check():", e)
            # Without a match result there is nothing to classify; stop here
            # instead of failing later on an unbound variable.
            return

        try:
            if is_sports_query:
                row['domain_is_right'] = 'yes'
                # Fix: the negative branch used to write 'reply_is_right',
                # leaving 'intent_is_right' unset and dropping the row.
                row['intent_is_right'] = 'yes' if 'search' in intent else 'no'
                # NOTE(review): a NaN response_text is "not None" and is
                # therefore flagged 'yes' -- confirm this is intended.
                row['response_is_right'] = 'yes' if response_text is not None else 'no'
                flags = {
                    'domain_is_right': row['domain_is_right'],
                    'intent_is_right': row['intent_is_right'],
                    'response_is_right': row['response_is_right'],
                }
            else:
                flags = {
                    'domain_is_right': "",
                    'intent_is_right': "",
                    'response_is_right': "",
                }

            record = {field: row[field] for field in self._PASSTHROUGH_FIELDS}
            record['query'] = query
            record['domain'] = domain
            record['intent'] = intent
            record['response_text'] = response_text
            record.update(flags)
        except Exception as e:
            # Best-effort boundary kept from the original: report and skip.
            print("The error of getting generator in the module of sports_check():", e)
            return

        yield record