duplicate_check.py 1.77 KB
Newer Older
StudentCWZ's avatar
StudentCWZ committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
# -*- coding: utf-8 -*-
# @Author: Gree
# @Date:   2020-12-17 16:03:28
# @Last Modified by:   StudentCWZ
# @Last Modified time: 2020-12-26 08:59:41


import conn_sql as cs
import pandas as pd
import data_deal as dd


class DuplicateCheck(object):
    """
    Remove raw records whose ``query`` has already been stored and classified.

    1. Compares the ``query`` field of the raw data against the ``query``
       column of the data that is already in the store and correctly
       classified.
    2. A raw record whose query matches a stored one is removed; every other
       record is kept.
    3. This keeps identical queries from entering the automatic
       classification / labelling module again.
    """

    def __init__(self):
        # Queries known to be stored already. The list is extended on every
        # call to duplicate_check(), so one instance accumulates the queries
        # of all comparison sets it has been given.
        self.query_lst = []

    def duplicate_check(self, contrast_data, initial_df):
        """
        Return the rows of ``initial_df`` whose ``query`` is not yet stored.

        Args:
            contrast_data: Iterable of ``(datetime, query)`` pairs describing
                the records that are already stored.
            initial_df: ``pandas.DataFrame`` of raw records; it must have a
                ``query`` column. It is not modified.

        Returns:
            A new ``pandas.DataFrame`` holding the rows of ``initial_df``
            whose ``query`` does not occur in ``self.query_lst`` (original
            index labels are kept), or ``None`` if the filtering step fails.

        Errors are handled on a best-effort basis: an exception in either
        step is printed instead of propagated. If the comparison set cannot
        be loaded, filtering still runs against the queries collected so far.
        """
        print("Loading the module of duplicate_check ...")
        try:
            df_duplicate = pd.DataFrame(list(contrast_data), columns=['datetime', 'query'])
            # Record each stored query only once per comparison set.
            self.query_lst.extend(df_duplicate['query'].drop_duplicates().tolist())
        except Exception as e:
            print(e)

        try:
            # Select with a boolean mask: DataFrame.drop() is not in-place, so
            # dropping row by row inside a loop and ignoring the result would
            # leave the frame untouched.
            already_stored = initial_df['query'].isin(self.query_lst)
            input_df = initial_df[~already_stored].copy()

            print('The dimension of input dataframe: ', end='')
            print(input_df.shape)  # shape of the frame after duplicate removal
            print("Get input data successfully!")

            return input_df
        except Exception as e:
            print(e)
            return None