# -*- coding: utf-8 -*-
# @Author: StudentCWZ
# @Date:   2020-11-30 12:27:58
# @Last Modified by:   Gree
# @Last Modified time: 2020-12-18 15:23:29

import re

import pandas as pd

# Keywords that mark a query as an FM (radio / story / audio programme) request.
# Compiled once at import time instead of being looked up for every row.
_FM_QUERY_PATTERN = re.compile(
    r'.*?(故事|白雪公主的|白雪公主讲|评书|小品|相声|京剧|美文之声|广播|笑林|黑猫警长'
    r'|奥特曼|葫芦娃|fm|星辰变|童话格林|卖火柴|下一个三字经|水浒|频道|西游记|西厢记'
    r'|小说|主播|个台|下一集|下一段|下一章|下一张|之声|收听|兆|FM|格林童话|戏曲'
    r'|三字经|冰雪奇缘|京戏|郭德纲|集|丑小鸭|阿拉丁|米小圈|萌鸡小队|虾球传|经济杂谈'
    r'|调频|亿万老婆买一送一|梁祝|袁阔成|三国演义|小猪佩奇|黄梅戏|电台|冬吴同学会'
    r'|猪猪侠|游园惊梦|睡前故事|电视剧|青华浮梦|安徒生|二人转|下一节|脱口秀|海底小纵队).*'
)

# A query containing any of these substrings is not treated as an FM query,
# even when it also contains an FM keyword.
_EXCLUDED_QUERY_SUBSTRINGS = (
    '关', '体操', '爱情故事', '建成了', '丰田', '配置', '研究', '走私',
    '黑锅', '奇异', '会议', '贞操', '你妹啊', '音量', '语音', '停止',
)

# Exact queries that match an FM keyword but are excluded from the check.
_EXCLUDED_QUERIES = frozenset((
    '不听故事',
    '唱的故事',
    '请找私有制的故事',
    '播放来的故事',
    '应该剪个两只小老虎的故事',
))

# A response containing any of these substrings is labelled as wrong.
_BAD_RESPONSE_MARKERS = (
    '抱歉', '绘本', '推荐那些事', '跳泥坑', '叮当头条', '保持独立', '来读诗',
    '来听听男一号吧', '推荐你听听最近很火的湖北传统采茶戏', '魔镜魔镜告诉我',
)


def _is_missing(value):
    """Return True when a cell holds None or a pandas missing marker (NaN/NaT/NA)."""
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Array-like cells have no single truth value; they are not "missing".
        return False


def _is_fm_query(query):
    """Return True when *query* is a string that matches the FM keyword rules."""
    if not isinstance(query, str):
        # Missing or non-text queries cannot be matched; such rows are skipped.
        return False
    if query in _EXCLUDED_QUERIES:
        return False
    if any(word in query for word in _EXCLUDED_QUERY_SUBSTRINGS):
        return False
    return _FM_QUERY_PATTERN.search(query) is not None


def _response_label(response_text):
    """Return 'yes' for a text response without any bad marker, otherwise 'no'."""
    if not isinstance(response_text, str):
        # None and NaN (pandas' missing value) both mean there is no response.
        return 'no'
    if any(marker in response_text for marker in _BAD_RESPONSE_MARKERS):
        return 'no'
    return 'yes'


def FmCheck(input_df):
    """Check the domain, intent and response_text fields of FM corpus rows.

    Rows are read with ``DataFrame.iterrows()``. A row is reported only when
    its ``query`` matches the FM keyword rules and its ``domain`` equals
    ``'fm'``; every other row is skipped, so ``domain_is_right`` is always
    ``'yes'`` in the output.

    Args:
        input_df: DataFrame with the columns ``date_time``, ``request_id``,
            ``mac_wifi``, ``user_id``, ``query``, ``domain``, ``intent`` and
            ``response_text``.

    Yields:
        One dict per reported row containing the eight input fields plus
        ``domain_is_right``, ``intent_is_right`` and ``response_is_right``,
        each set to ``'yes'`` or ``'no'``. A missing intent (None/NaN) gives
        ``intent_is_right == 'no'``; a missing, non-text or marker-containing
        response gives ``response_is_right == 'no'``.

    Raises:
        KeyError: If a required column is absent from ``input_df``.
    """
    print('The module of fm_check is running!')
    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        if domain != 'fm' or not _is_fm_query(query):
            continue

        # The yield is deliberately not wrapped in try/except: a handler here
        # would intercept GeneratorExit and break early close() of the generator.
        yield {
            'date_time': row['date_time'],
            'request_id': row['request_id'],
            'mac_wifi': row['mac_wifi'],
            'user_id': row['user_id'],
            'query': query,
            'domain': domain,
            'intent': intent,
            'response_text': response_text,
            'domain_is_right': 'yes',
            'intent_is_right': 'no' if _is_missing(intent) else 'yes',
            'response_is_right': _response_label(response_text),
        }
    print('The module of fm_check is executed!')