readData.py 2.31 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
# -*- coding: utf-8 -*-
# @Author: Gree
# @Date:   2021-05-29 15:03:21
# @Last Modified by:   Gree
# @Last Modified time: 2021-05-29 15:42:46


import configparser
import os

import csv
import pandas as pd

class ReadData:
    def __init__(self):
        """
        1. 初始化变量
        """
        # 获取 self.dirPath
        self.dirPath = os.path.split(os.path.realpath(__file__))[0]
        # 获取 self.filePath
        self.filePath = os.path.join(self.dirPath, "data.csv")
        # 生成 cf 对象
        self.cf = configparser.ConfigParser()
        # 定义 initial_list
        self.initial_list = []


    def readData(self):
        """
        1. 去读 csv 文件数据
        2. 处理数据,返回数据框
        """
        # 新建一个空列表接收元素
        initial_list = []
        # 捕获异常
        try:
            with open(self.filePath,'r',encoding="utf-8") as f:
                # 读取 csv 文件
                reader = csv.reader(f)
                # 遍历
                for row in reader:
                    # 列表添加元素
                    initial_list.append(row)

        except Exception as e:
            print("I/O error: ", e)

        else:
            # 返回 initial_list
            return initial_list

        finally:
            # 文件关闭
            f.close()



    def getData(self, initial_list):
        # 捕获异常
        try:
            # 数据框操作
            initial_df = pd.DataFrame(initial_list[1:], columns = initial_list[0])

            # 输出 log 信息
            print("数据框去重前:")
            print("The dimension of initial_df: ", end = "")
            # 数据框维度(去重前数据框维度)
            print(initial_df.shape)

        except Exception as e:
            print("The error of getting initial_df: ", e)

        # 捕获异常
        try:
            # 数据框列去重
            df = initial_df.drop_duplicates(subset = "query")

            # 输出 log 信息
            print("数据框去重后:")
            print("The dimension of df: ", end = "")
            # 数据框维度(去重后数据框维度)
            print(df.shape)

        except Exception as e:
            print("The error of getting df: ", e)


        else:
            # 返回数据框
            return df