# -*- coding: utf-8 -*-
# @Author: Gree
# @Date:   2020-12-17 15:57:05
# @Last Modified by:   Gree
# @Last Modified time: 2021-06-01 14:53:06

import pandas as pd

from config import conn_sql as cs
from period import get_time as gt


class DataDeal(object):
    """Build the initial (raw) data frame that later steps operate on."""

    def initial_df(self, initial_data):
        """Turn raw rows into a de-duplicated, date-ordered data frame.

        Steps:
            1. Load the rows into a frame with the columns ``date_time``,
               ``request_id``, ``mac_wifi``, ``user_id``, ``query``,
               ``domain``, ``intent`` and ``response_text``.
            2. Append three empty-string columns: ``domain_is_right``,
               ``intent_is_right`` and ``response_is_right``.
            3. Sort by ``date_time`` descending and drop duplicate ``query``
               values, so the first (most recent) row of each query is kept.
            4. Sort by ``date_time`` ascending.

        Progress and the frame shape before/after de-duplication are printed
        to stdout.

        Args:
            initial_data: Iterable of rows, each providing the eight values
                listed in step 1, in that order.

        Returns:
            The prepared ``pandas.DataFrame``, or ``None`` when any step
            raises; the error is printed rather than propagated.
        """
        print("Loading the module of initial_df ...")
        try:
            df_result = pd.DataFrame(
                list(initial_data),
                columns=[
                    "date_time",
                    "request_id",
                    "mac_wifi",
                    "user_id",
                    "query",
                    "domain",
                    "intent",
                    "response_text",
                ],
            )

            # Empty placeholder columns appended after the raw columns.
            df_result['domain_is_right'] = ''
            df_result['intent_is_right'] = ''
            df_result['response_is_right'] = ''

            # Newest first, so drop_duplicates (keep="first" by default)
            # retains the most recent row of every query.
            df_result = df_result.sort_values(["date_time"], ascending=False)
            print(
                "The dimension of initial dataframe before duplicated: ",
                df_result.shape,
                sep="",
            )
            df_result = df_result.drop_duplicates(subset="query")

            # Restore chronological order for the returned frame.
            df_result = df_result.sort_values(["date_time"], ascending=True)
            print(
                "The dimension of initial dataframe: ",
                df_result.shape,
                sep="",
            )
        except Exception as e:
            # Best-effort contract: report the failure and hand back None
            # instead of raising.
            print("The error of initial_df(): ", e)
            return None
        else:
            return df_result
        finally:
            print("Exiting the module of initial_df ...")