# -*- coding: utf-8 -*-
# @Author: StudentCWZ
# @Date: 2020-11-30 12:53:14
# @Last Modified by: Gree
# @Last Modified time: 2020-12-18 15:34:03

import re

import pandas as pd

# A query is a news candidate when it mentions one of these keywords.
# Compiled once at import time instead of once per row.
_NEWS_PATTERN = re.compile(r'.*?(新闻|今日头条).*')

# Queries containing any of these substrings are not checked as news.
_EXCLUDED_SUBSTRINGS = ('等于', '周杰伦', '制冷')

# Queries that are exactly one of these phrases are not checked as news.
_EXCLUDED_QUERIES = frozenset({
    '我要听最新',
    '打开新闻',
    '我要听热门',
    '格力空调读新闻',
})


def _is_news_query(query):
    """Return True when *query* should be checked as a news utterance."""
    # Non-string values (e.g. None / NaN from missing cells) cannot be matched
    # by the regex; such rows are skipped rather than raising.
    if not isinstance(query, str):
        return False
    if _NEWS_PATTERN.search(query) is None:
        return False
    if query in _EXCLUDED_QUERIES:
        return False
    return not any(fragment in query for fragment in _EXCLUDED_SUBSTRINGS)


def NewsCheck(input_df):
    """Check the domain, intent and response_text fields of news utterances.

    Iterates over ``input_df`` row by row. A row is reported only when its
    ``query`` mentions a news keyword, is not one of the excluded phrases,
    and its ``domain`` equals ``'news'``.

    Args:
        input_df: DataFrame providing the columns ``query``, ``domain``,
            ``intent``, ``response_text``, ``date_time``, ``request_id``,
            ``mac_wifi`` and ``user_id``.

    Yields:
        dict: the original fields of the row plus three verdicts:
            ``domain_is_right`` (always ``'yes'`` for reported rows),
            ``intent_is_right`` (``'yes'`` when ``'search'`` occurs in the
            intent, otherwise ``'no'``) and ``response_is_right`` (``'yes'``
            when ``response_text`` is not ``None``, otherwise ``'no'``).

    Rows whose intent does not support a membership test, or that lack one of
    the reported columns, are skipped silently.
    """
    print('The module of news_check is running!')
    for _, row in input_df.iterrows():
        query = row['query']
        domain = row['domain']
        intent = row['intent']
        response_text = row['response_text']

        if not _is_news_query(query):
            continue
        if domain != 'news':
            continue

        # NOTE(review): a NaN response_text is "not None" and is therefore
        # marked 'yes' — confirm whether missing cells can arrive as NaN.
        response_is_right = 'yes' if response_text is not None else 'no'

        try:
            intent_is_right = 'yes' if 'search' in intent else 'no'
            record = {
                'date_time': row['date_time'],
                'request_id': row['request_id'],
                'mac_wifi': row['mac_wifi'],
                'user_id': row['user_id'],
                'query': query,
                'domain': domain,
                'intent': intent,
                'response_text': response_text,
                'domain_is_right': 'yes',
                'intent_is_right': intent_is_right,
                'response_is_right': response_is_right,
            }
        except (KeyError, TypeError):
            # Best effort: a malformed row (intent that is not a container,
            # missing column) is skipped instead of aborting the whole scan.
            continue

        # The yield is deliberately outside the try block so that closing the
        # generator early (GeneratorExit) is never swallowed.
        yield record
    print('The module of news_check is executed!')