# science_check.py
# -*- coding: utf-8 -*-
# @Author: Gree
# @Date:   2021-06-02 14:31:12
# @Last Modified by:   Gree
# @Last Modified time: 2021-06-02 14:34:55


import re
import pandas as pd


class ScienceCheck:
    """Checks for the automated classification of science-corpus rows."""

    # A query counts as a science query when it contains one of these keywords
    # (add, subtract, multiply, divide, equals, power, light-year, convert,
    # length). Compiled once instead of on every row.
    _SCIENCE_QUERY = re.compile(r'.*?(加|减|乘|除|等于|次方|光年|换算|长度).*')
    # Substrings that mark an intent as a calculator / unit intent.
    _INTENT_MARKERS = ('calculator', 'unit_')
    # Substrings (equals, unit, "cannot work it out") accepted in a response.
    _RESPONSE_MARKERS = ('等于', '单位', '做不出来')
    # Columns copied unchanged from the input row into the result record.
    _PASSTHROUGH_FIELDS = ('date_time', 'request_id', 'mac_wifi', 'user_id')

    @staticmethod
    def _has_any(text, markers):
        """Return True when *text* is a string containing any of *markers*."""
        return isinstance(text, str) and any(m in text for m in markers)

    def science_check(self, row):
        """Yield one result record carrying the science-check flags for *row*.

        Args:
            row: A mapping-like object (dict, pandas Series, ...) that provides
                the keys 'domain', 'query', 'intent', 'response_text',
                'date_time', 'request_id', 'mac_wifi' and 'user_id'.

        Yields:
            A dict with the eight input fields plus 'domain_is_right',
            'intent_is_right' and 'response_is_right'.

            * If the query is a string that matches a science keyword and does
              not contain '站在', 'domain_is_right' is 'yes' and the other two
              flags are 'yes' or 'no'. The same three flags are also written
              back onto *row*.
            * Otherwise (including a missing / non-string query) all three
              flags are empty strings and *row* is left untouched.

        Raises:
            KeyError: On first iteration, if *row* lacks 'domain', 'query',
                'intent' or 'response_text'.

        Any other failure while building the record (for example a missing
        'date_time' column) is reported with ``print`` and nothing is yielded.

        NOTE(review): the value of row['domain'] is never compared with
        anything; 'domain_is_right' only reflects whether the query looks like
        a science query. Confirm this is intended.
        """
        domain = row['domain']
        query = row['query']
        intent = row['intent']
        response_text = row['response_text']

        # Guard on the type first: a None/NaN query used to raise inside
        # re.search, leave the match variable unbound, and make the row vanish
        # from the output. Such rows are now treated as "not a science query".
        is_science = (
            isinstance(query, str)
            and ScienceCheck._SCIENCE_QUERY.search(query) is not None
            and '站在' not in query
        )

        try:
            if is_science:
                domain_flag = 'yes'

                # A missing / non-string intent cannot be a correct intent.
                if ScienceCheck._has_any(intent, ScienceCheck._INTENT_MARKERS):
                    intent_flag = 'yes'
                else:
                    intent_flag = 'no'

                # The literal query below is a hard-coded exclusion carried
                # over unchanged; presumably a known bad sample -- TODO confirm.
                response_ok = (
                    ScienceCheck._has_any(
                        response_text, ScienceCheck._RESPONSE_MARKERS
                    )
                    and '24乘以50384乘以59等于' not in query
                )
                response_flag = 'yes' if response_ok else 'no'

                # Existing behaviour kept for callers: the flags are also
                # stored on the input row.
                row['domain_is_right'] = domain_flag
                row['intent_is_right'] = intent_flag
                row['response_is_right'] = response_flag
            else:
                domain_flag = intent_flag = response_flag = ""

            # Build the record once; key order matches the original output.
            record = {
                field: row[field]
                for field in ScienceCheck._PASSTHROUGH_FIELDS
            }
            record.update({
                'query': query,
                'domain': domain,
                'intent': intent,
                'response_text': response_text,
                'domain_is_right': domain_flag,
                'intent_is_right': intent_flag,
                'response_is_right': response_flag,
            })
        except Exception as e:
            # Deliberately best-effort: one malformed row must not abort a
            # whole batch, so report it and yield nothing for this row.
            print("The error of getting generator in the module of science_check():", e)
            return

        yield record