# translate_check.py
# -*- coding: utf-8 -*-
# @Author: Gree
# @Date:   2021-06-02 14:43:25
# @Last Modified by:   Gree
# @Last Modified time: 2021-06-02 14:55:02


import re
import pandas as pd


class TranslateCheck:
    """Check the automated classification of translation-corpus rows.

    A query counts as a translation request when it contains one of the
    trigger keywords and is not ruled out by the exclusion lists below.
    """

    # Keywords whose presence anywhere in the query marks a translation request.
    _TRIGGER_RE = re.compile(
        r'翻译|英文怎么说|英语怎么说|什么意思|怎么说|怎么拼写|英文|中文|英语'
    )
    # Substrings that rule a query out even when a trigger keyword is present.
    _EXCLUDED_FRAGMENTS = ('英语介绍', '英文介绍', '傻逼', '闭嘴', '小逼崽子', '操我')
    # Exact queries that are ruled out outright.
    _EXCLUDED_QUERIES = frozenset({'听英文', '你会英文吗', '七六中文怎么说'})
    # Characters whose presence in a response marks the response as wrong.
    _BAD_RESPONSE_CHARS = ('1', '2', '5', '7', '8')
    # Exact responses that are marked as wrong.
    _BAD_RESPONSES = frozenset({'Ah'})

    @staticmethod
    def _yes_no(flag):
        """Map a boolean to the 'yes' / 'no' labels used in the output."""
        return 'yes' if flag else 'no'

    @staticmethod
    def _is_translation_query(query):
        """Return True when *query* is a string that looks like a translation request."""
        # A non-string query (e.g. None or a float NaN) cannot be searched;
        # treat it as "not a translation request" instead of raising.
        if not isinstance(query, str):
            return False
        if TranslateCheck._TRIGGER_RE.search(query) is None:
            return False
        if any(fragment in query for fragment in TranslateCheck._EXCLUDED_FRAGMENTS):
            return False
        return query not in TranslateCheck._EXCLUDED_QUERIES

    @staticmethod
    def _is_valid_response(response_text):
        """Return True when *response_text* is a string with no disallowed content."""
        if not isinstance(response_text, str):
            return False
        if any(char in response_text for char in TranslateCheck._BAD_RESPONSE_CHARS):
            return False
        return response_text not in TranslateCheck._BAD_RESPONSES

    def translate_check(self, row):
        """Yield one record describing whether a translation row is classified correctly.

        Args:
            row: Mapping-like object supporting ``row[key]`` reads and writes.
                It must provide 'domain', 'query', 'intent' and
                'response_text'; 'date_time', 'request_id', 'mac_wifi' and
                'user_id' are copied into the output record.

        Yields:
            A dict with the copied fields plus 'domain_is_right',
            'intent_is_right' and 'response_is_right'.  For a translation
            query 'domain_is_right' is 'yes' and the other two flags are
            'yes' or 'no'; for any other query (including a non-string one)
            all three flags are empty strings.  Nothing is yielded when one
            of the copied metadata fields is missing from *row*.

        Side effects:
            For a translation query the three flags are also written back
            onto *row*.
        """
        domain = row['domain']
        query = row['query']
        intent = row['intent']
        response_text = row['response_text']

        # Helpers are reached through the class name rather than ``self`` so
        # the method keeps working for callers that never relied on instance
        # state.
        if TranslateCheck._is_translation_query(query):
            domain_flag = 'yes'
            intent_flag = TranslateCheck._yes_no(
                isinstance(intent, str) and 'translate' in intent
            )
            response_flag = TranslateCheck._yes_no(
                TranslateCheck._is_valid_response(response_text)
            )
            # Write the verdicts back onto the row.  The intent verdict goes
            # under 'intent_is_right' in both outcomes; storing the negative
            # case under another key would leave this one unset.
            row['domain_is_right'] = domain_flag
            row['intent_is_right'] = intent_flag
            row['response_is_right'] = response_flag
        else:
            domain_flag = intent_flag = response_flag = ""

        try:
            record = {
                'date_time': row['date_time'],
                'request_id': row['request_id'],
                'mac_wifi': row['mac_wifi'],
                'user_id': row['user_id'],
                'query': query,
                'domain': domain,
                'intent': intent,
                'response_text': response_text,
                'domain_is_right': domain_flag,
                'intent_is_right': intent_flag,
                'response_is_right': response_flag,
            }
        except KeyError as e:
            # Best effort: report the missing metadata field and skip the row.
            print("The error of getting generator in the module of translate_check():", e)
            return

        yield record