# data_deal.py
# -*- coding: utf-8 -*-
# @Author: Gree
# @Date:   2020-12-17 15:57:05
# @Last Modified by:   Gree
# @Last Modified time: 2021-06-01 14:53:06


import pandas as pd
from config import conn_sql as cs
from period import get_time as gt


class DataDeal(object):
    """Build the raw working DataFrame that later processing operates on."""

    def initial_df(self, initial_data):
        """Turn raw records into a de-duplicated, time-ordered DataFrame.

        Args:
            initial_data: Iterable of 8-field records whose fields are, in
                order: ``date_time, request_id, mac_wifi, user_id, query,
                domain, intent, response_text``.

        Returns:
            A DataFrame with the eight input columns plus three columns
            initialised to the empty string (``domain_is_right``,
            ``intent_is_right``, ``response_is_right``). It keeps one row
            per distinct ``query`` (the one with the greatest ``date_time``)
            and is sorted by ``date_time`` ascending.
            Returns ``None`` when building the frame fails; the error is
            printed to stdout instead of being raised.
        """
        # Log entry into the initial_df module.
        print("Loading the module of initial_df ...")
        try:
            df_result = pd.DataFrame(
                list(initial_data),
                columns=[
                    "date_time", "request_id", "mac_wifi", "user_id",
                    "query", "domain", "intent", "response_text",
                ],
            )
            # Add the three empty flag columns.
            for flag_column in ("domain_is_right", "intent_is_right", "response_is_right"):
                df_result[flag_column] = ''
            # Newest first, so that drop_duplicates (which keeps the first
            # occurrence) retains the most recent row for each query.
            df_result = df_result.sort_values(["date_time"], ascending=False)
            print("The dimension of initial dataframe before duplicated: ", df_result.shape, sep="")
            df_result = df_result.drop_duplicates(subset="query")
            # Restore chronological order for the caller.
            df_result = df_result.sort_values(["date_time"], ascending=True)
            print("The dimension of initial dataframe: ", df_result.shape, sep="")

        except Exception as e:
            # Best-effort contract: report the problem and signal failure
            # with None rather than propagating the exception.
            print("The error of initial_df(): ", e)
            return None

        else:
            # Single success exit; reached only when the try body did not raise.
            return df_result

        finally:
            # Log exit from the initial_df module on every path.
            print("Exiting the module of initial_df ...")