From addbb1841f7ce552fb846cd1f2ba2e52ca4b13cd Mon Sep 17 00:00:00 2001
From: StudentCWZ <330459539@qq.com>
Date: Mon, 31 May 2021 16:29:13 +0800
Subject: [PATCH] a new project of testing stop-words effect

---
 README.md            |  47 ++++-
 config/__init__.py   |   0
 config/readConfig.py |  38 ++++
 config/stopWord.conf |   2 +
 data/__init__.py     |   0
 data/readData.py     |  91 +++++++++
 requirement.txt      | 256 +++++++++++++++++++++++++
 stopWordTest.py      | 443 +++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 874 insertions(+), 3 deletions(-)
 create mode 100644 config/__init__.py
 create mode 100644 config/readConfig.py
 create mode 100644 config/stopWord.conf
 create mode 100644 data/__init__.py
 create mode 100644 data/readData.py
 create mode 100644 requirement.txt
 create mode 100644 stopWordTest.py

diff --git a/README.md b/README.md
index 1bd3c96..62fd521 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,44 @@
-# StopWordTest
-
-It is a project of testing stop-words effect.
\ No newline at end of file
+# 项目简介
+- 项目名称：停用词效果测试
+- 功能说明：根据提供的停用词，测试停用词的效果。
+- 代码仓库：https://api.gree.com/gitlab/cuiweizhi/StopWordTest.git
+- 项目负责人：崔为之
+- 目录结构：
+```
+├─config  
+|  ├─__init__.py
+|  ├─stopWord.conf   // 配置文件
+|  ├─readConfig.py  // 读取配置脚本
+│  
+├─data
+│   ├─data.csv        //原始数据文件(过大，不上传)
+|   │  
+|   └─readData.py     //读取数据脚本
+|
+|
+├─result //最终数据存储路径文件夹 (运行主体代码自动生成)
+|
+|
+├─requirement.txt //python3 环境的配置
+|
+|
+└─stopWordTest.py // 主体代码
+```
+# 如何运行
+- 创建虚拟环境: python -m venv venv
+- 安装软件包： pip install -r requirement.txt
+- 运行服务：
+```
+(1) 修改配置文件：stopWord.conf 和 data.csv (根据自己需要修改)
+(2) 运行服务：python3 stopWordTest.py
+```
+# 版本信息
+```
+v1.0
+```
+# 更新日志
+- v1.0 版本
+```
+(1) 首次创建项目
+(2) 提供技能
+```
diff --git a/config/__init__.py b/config/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/config/readConfig.py b/config/readConfig.py
new file mode 100644
index 0000000..22a74e2
--- /dev/null
+++ b/config/readConfig.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+# @Author: Gree
+# @Date:   2021-05-29 14:49:12
+# @Last Modified by:   Gree
+# @Last Modified time: 2021-05-29 15:02:21
+
+
+import configparser
+import os
+
+
+
+class ReadConfig:
+    def __init__(self):
+        """
+        1. 初始化变量
+        """
+        # 获取 self.dirPath
+        self.dirPath = os.path.split(os.path.realpath(__file__))[0]
+        # 获取 self.filePath
+        self.filePath = os.path.join(self.dirPath, "stopWord.conf")
+        # 生成 cf 对象
+        self.cf = configparser.ConfigParser()
+
+
+    def readConfig(self):
+        """
+        1. 读取配置文件
+        """
+        # 读取 stopWord.conf 配置文件
+        self.cf.read(self.filePath)
+
+
+        # 获取 stopWordList
+        stopWordList = self.cf["stop_word"]["stop_word"].split(",")
+
+        # 返回 stopWordList
+        return stopWordList
diff --git a/config/stopWord.conf b/config/stopWord.conf
new file mode 100644
index 0000000..bd91648
--- /dev/null
+++ b/config/stopWord.conf
@@ -0,0 +1,2 @@
+[stop_word]
+stop_word = 格力金贝,格力空调,格力
\ No newline at end of file
diff --git a/data/__init__.py b/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/data/readData.py b/data/readData.py
new file mode 100644
index 0000000..28e6aa8
--- /dev/null
+++ b/data/readData.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+# @Author: Gree
+# @Date:   2021-05-29 15:03:21
+# @Last Modified by:   Gree
+# @Last Modified time: 2021-05-29 15:42:46
+
+
+import configparser
+import os
+
+import csv
+import pandas as pd
+
+class ReadData:
+    def __init__(self):
+        """
+        1. 初始化变量
+        """
+        # 获取 self.dirPath
+        self.dirPath = os.path.split(os.path.realpath(__file__))[0]
+        # 获取 self.filePath
+        self.filePath = os.path.join(self.dirPath, "data.csv")
+        # 生成 cf 对象
+        self.cf = configparser.ConfigParser()
+        # 定义 initial_list
+        self.initial_list = []
+
+
+    def readData(self):
+        """
+        1. 去读 csv 文件数据
+        2. 处理数据，返回数据框
+        """
+        # 新建一个空列表接收元素
+        initial_list = []
+        # 捕获异常
+        try:
+            with open(self.filePath,'r',encoding="utf-8") as f:
+                # 读取 csv 文件
+                reader = csv.reader(f)
+                # 遍历
+                for row in reader:
+                    # 列表添加元素
+                    initial_list.append(row)
+
+        except Exception as e:
+            print("I/O error: ", e)
+
+        else:
+            # 返回 initial_list
+            return initial_list
+
+        finally:
+            # 文件关闭
+            f.close()
+
+
+
+    def getData(self, initial_list):
+        # 捕获异常
+        try:
+            # 数据框操作
+            initial_df = pd.DataFrame(initial_list[1:], columns = initial_list[0])
+
+            # 输出 log 信息
+            print("数据框去重前：")
+            print("The dimension of initial_df: ", end = "")
+            # 数据框维度(去重前数据框维度)
+            print(initial_df.shape)
+
+        except Exception as e:
+            print("The error of getting initial_df: ", e)
+
+        # 捕获异常
+        try:
+            # 数据框列去重
+            df = initial_df.drop_duplicates(subset = "query")
+
+            # 输出 log 信息
+            print("数据框去重后：")
+            print("The dimension of df: ", end = "")
+            # 数据框维度(去重后数据框维度)
+            print(df.shape)
+
+        except Exception as e:
+            print("The error of getting df: ", e)
+
+
+        else:
+            # 返回数据框
+            return df
diff --git a/requirement.txt b/requirement.txt
new file mode 100644
index 0000000..582abe3
--- /dev/null
+++ b/requirement.txt
@@ -0,0 +1,256 @@
+alabaster @ file:///home/ktietz/src/ci/alabaster_1611921544520/work
+anaconda-client==1.7.2
+anaconda-navigator==2.0.3
+anaconda-project @ file:///tmp/build/80754af9/anaconda-project_1621348054992/work
+anyio @ file:///opt/concourse/worker/volumes/live/64740ac7-3a9c-4fbb-6685-a51c4ff8b4ca/volume/anyio_1617783319350/work/dist
+appdirs==1.4.4
+applaunchservices==0.2.1
+appnope @ file:///opt/concourse/worker/volumes/live/5f13e5b3-5355-4541-5fc3-f08850c73cf9/volume/appnope_1606859448618/work
+appscript @ file:///opt/concourse/worker/volumes/live/82e8b4c7-2416-4d10-509e-144ca79d9b1d/volume/appscript_1611426996703/work
+argh==0.26.2
+argon2-cffi @ file:///opt/concourse/worker/volumes/live/d733ceb5-7f19-407b-7da7-a386540ab855/volume/argon2-cffi_1613037492998/work
+asn1crypto @ file:///tmp/build/80754af9/asn1crypto_1596577642040/work
+astroid @ file:///opt/concourse/worker/volumes/live/343a8902-287c-47fb-6db8-923a63364302/volume/astroid_1613500849157/work
+astropy @ file:///opt/concourse/worker/volumes/live/0a514e04-301a-48f9-530f-90365df6420e/volume/astropy_1617745469121/work
+async-generator @ file:///home/ktietz/src/ci/async_generator_1611927993394/work
+atomicwrites==1.4.0
+attrs @ file:///tmp/build/80754af9/attrs_1620827162558/work
+autopep8 @ file:///tmp/build/80754af9/autopep8_1615918855173/work
+Babel @ file:///tmp/build/80754af9/babel_1620871417480/work
+backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
+backports.functools-lru-cache @ file:///tmp/build/80754af9/backports.functools_lru_cache_1618170165463/work
+backports.shutil-get-terminal-size @ file:///tmp/build/80754af9/backports.shutil_get_terminal_size_1608222128777/work
+backports.tempfile @ file:///home/linux1/recipes/ci/backports.tempfile_1610991236607/work
+backports.weakref==1.0.post1
+beautifulsoup4 @ file:///home/linux1/recipes/ci/beautifulsoup4_1610988766420/work
+bitarray @ file:///opt/concourse/worker/volumes/live/8a51e4ff-5d78-46c1-45c6-5df5fa3f52c4/volume/bitarray_1620827546654/work
+bkcharts==0.2
+black==19.10b0
+bleach @ file:///tmp/build/80754af9/bleach_1612211392645/work
+bokeh @ file:///opt/concourse/worker/volumes/live/65ce9588-f765-4deb-5b52-1c552a693654/volume/bokeh_1620783891289/work
+boto==2.49.0
+Bottleneck==1.3.2
+brotlipy==0.7.0
+certifi==2020.12.5
+cffi @ file:///opt/concourse/worker/volumes/live/0ef369cc-6ba0-47e7-75da-208c6400381d/volume/cffi_1613246948181/work
+chardet @ file:///opt/concourse/worker/volumes/live/c798b2ee-88b1-4341-6830-161a92c2399e/volume/chardet_1607706832595/work
+click @ file:///tmp/build/80754af9/click_1621604852318/work
+cloudpickle @ file:///tmp/build/80754af9/cloudpickle_1598884132938/work
+clyent==1.2.2
+colorama @ file:///tmp/build/80754af9/colorama_1607707115595/work
+conda==4.10.1
+conda-build==3.21.4
+conda-content-trust @ file:///tmp/build/80754af9/conda-content-trust_1617045594566/work
+conda-pack @ file:///tmp/build/80754af9/conda-pack_1611163042455/work
+conda-package-handling @ file:///opt/concourse/worker/volumes/live/73497069-9b43-4ad9-50ec-1abb340e14eb/volume/conda-package-handling_1618262140058/work
+conda-repo-cli @ file:///tmp/build/80754af9/conda-repo-cli_1620168426516/work
+conda-token @ file:///tmp/build/80754af9/conda-token_1620076980546/work
+conda-verify==3.4.2
+contextlib2==0.6.0.post1
+cryptography @ file:///opt/concourse/worker/volumes/live/c515855a-effc-46df-74dc-542901b701da/volume/cryptography_1616769282442/work
+cycler==0.10.0
+Cython @ file:///opt/concourse/worker/volumes/live/da4db94a-3449-4978-4400-64181f888dab/volume/cython_1618435143829/work
+cytoolz==0.11.0
+dask @ file:///tmp/build/80754af9/dask-core_1617390489108/work
+decorator @ file:///home/ktietz/src/ci/decorator_1611930055503/work
+defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work
+diff-match-patch @ file:///tmp/build/80754af9/diff-match-patch_1594828741838/work
+distributed @ file:///opt/concourse/worker/volumes/live/66d9a668-fe37-485b-57c1-fefe2be682be/volume/distributed_1621290174414/work
+docutils @ file:///opt/concourse/worker/volumes/live/5fb9307e-f900-46e0-7a3f-bf61cc0172f2/volume/docutils_1620827968414/work
+entrypoints==0.3
+et-xmlfile==1.1.0
+fake-useragent==0.1.11
+fastcache==1.1.0
+filelock @ file:///home/linux1/recipes/ci/filelock_1610993975404/work
+flake8 @ file:///tmp/build/80754af9/flake8_1615834841867/work
+Flask @ file:///home/ktietz/src/ci/flask_1611932660458/work
+fsspec @ file:///tmp/build/80754af9/fsspec_1617959894824/work
+future==0.18.2
+gevent @ file:///opt/concourse/worker/volumes/live/123efac8-0706-463e-4084-078386897222/volume/gevent_1616772940436/work
+glob2 @ file:///home/linux1/recipes/ci/glob2_1610991677669/work
+gmpy2==2.0.8
+greenlet @ file:///opt/concourse/worker/volumes/live/3ef4f3fa-d3c5-4d18-614a-db20c54a531f/volume/greenlet_1620913631134/work
+h5py==2.10.0
+HeapDict==1.0.1
+html5lib @ file:///tmp/build/80754af9/html5lib_1593446221756/work
+idna @ file:///home/linux1/recipes/ci/idna_1610986105248/work
+imageio @ file:///tmp/build/80754af9/imageio_1617700267927/work
+imagesize @ file:///home/ktietz/src/ci/imagesize_1611921604382/work
+importlib-metadata @ file:///opt/concourse/worker/volumes/live/a634a87c-b5e5-41bd-628d-cd0413666c93/volume/importlib-metadata_1617877368300/work
+iniconfig @ file:///home/linux1/recipes/ci/iniconfig_1610983019677/work
+intervaltree @ file:///tmp/build/80754af9/intervaltree_1598376443606/work
+ipykernel @ file:///opt/concourse/worker/volumes/live/88f541d3-5a27-498f-7391-f2e50ca36560/volume/ipykernel_1596206680118/work/dist/ipykernel-5.3.4-py3-none-any.whl
+ipython @ file:///opt/concourse/worker/volumes/live/c432d8a7-d8f3-4e24-590f-f03d7e5f35e1/volume/ipython_1617120884257/work
+ipython-genutils @ file:///tmp/build/80754af9/ipython_genutils_1606773439826/work
+ipywidgets @ file:///tmp/build/80754af9/ipywidgets_1610481889018/work
+isort @ file:///tmp/build/80754af9/isort_1616355431277/work
+itsdangerous @ file:///tmp/build/80754af9/itsdangerous_1621432558163/work
+jdcal==1.4.1
+jedi @ file:///opt/concourse/worker/volumes/live/12a2c347-a8e4-4b62-5b19-dcc92a2254f6/volume/jedi_1606932552286/work
+Jinja2 @ file:///tmp/build/80754af9/jinja2_1621238361758/work
+joblib @ file:///tmp/build/80754af9/joblib_1613502643832/work
+json5==0.9.5
+jsonschema @ file:///tmp/build/80754af9/jsonschema_1602607155483/work
+jupyter==1.0.0
+jupyter-client @ file:///tmp/build/80754af9/jupyter_client_1616770841739/work
+jupyter-console @ file:///tmp/build/80754af9/jupyter_console_1616615302928/work
+jupyter-core @ file:///opt/concourse/worker/volumes/live/c8df8dce-dbb3-46e7-649c-adf4ed2dd00a/volume/jupyter_core_1612213293829/work
+jupyter-packaging @ file:///tmp/build/80754af9/jupyter-packaging_1613502826984/work
+jupyter-server @ file:///opt/concourse/worker/volumes/live/4f2e970c-4cba-4227-75ae-10ebe3b5f9c3/volume/jupyter_server_1616084051538/work
+jupyterlab @ file:///tmp/build/80754af9/jupyterlab_1619133235951/work
+jupyterlab-pygments @ file:///tmp/build/80754af9/jupyterlab_pygments_1601490720602/work
+jupyterlab-server @ file:///tmp/build/80754af9/jupyterlab_server_1617134334258/work
+jupyterlab-widgets @ file:///tmp/build/80754af9/jupyterlab_widgets_1609884341231/work
+keyring @ file:///opt/concourse/worker/volumes/live/9bd395b7-ac9a-4d22-790b-ab9dc88e7c19/volume/keyring_1621524569678/work
+kiwisolver @ file:///opt/concourse/worker/volumes/live/0b2f3e77-eaa3-4995-7dd0-c994762fcbde/volume/kiwisolver_1612282417472/work
+lazy-object-proxy @ file:///opt/concourse/worker/volumes/live/e4bc3ba3-f365-4387-5772-cbb667714c62/volume/lazy-object-proxy_1616529072711/work
+libarchive-c @ file:///tmp/build/80754af9/python-libarchive-c_1617780486945/work
+llvmlite==0.36.0
+locket==0.2.1
+lxml @ file:///opt/concourse/worker/volumes/live/42fc56a4-88b4-4605-6037-9605f4e5eeeb/volume/lxml_1616443240194/work
+MarkupSafe @ file:///opt/concourse/worker/volumes/live/c9141381-1dba-485b-7c96-99007bf7bcfd/volume/markupsafe_1621528150226/work
+matplotlib @ file:///opt/concourse/worker/volumes/live/41e8cd50-031f-4dda-5787-dd3c4f4e0f08/volume/matplotlib-suite_1613407855571/work
+mccabe==0.6.1
+mistune @ file:///opt/concourse/worker/volumes/live/95802d64-d39c-491b-74ce-b9326880ca54/volume/mistune_1594373201816/work
+mkl-fft==1.3.0
+mkl-random @ file:///opt/concourse/worker/volumes/live/54b31a45-1da5-4512-5c3a-93c9ff2af8bc/volume/mkl_random_1618853970587/work
+mkl-service==2.3.0
+mock @ file:///tmp/build/80754af9/mock_1607622725907/work
+more-itertools @ file:///tmp/build/80754af9/more-itertools_1613676688952/work
+mpmath==1.2.1
+msgpack @ file:///opt/concourse/worker/volumes/live/d7400f3a-e5de-4e85-5d4c-0c984c648401/volume/msgpack-python_1612287157185/work
+multipledispatch==0.6.0
+mypy-extensions==0.4.3
+navigator-updater==0.2.1
+nbclassic @ file:///tmp/build/80754af9/nbclassic_1616085367084/work
+nbclient @ file:///tmp/build/80754af9/nbclient_1614364831625/work
+nbconvert @ file:///opt/concourse/worker/volumes/live/2b9c1d93-d0fd-432f-7d93-66c93d81b614/volume/nbconvert_1601914875037/work
+nbformat @ file:///tmp/build/80754af9/nbformat_1617383369282/work
+nest-asyncio @ file:///tmp/build/80754af9/nest-asyncio_1613680548246/work
+networkx @ file:///tmp/build/80754af9/networkx_1617653298338/work
+nltk @ file:///tmp/build/80754af9/nltk_1621347441292/work
+nose @ file:///tmp/build/80754af9/nose_1606773131901/work
+notebook @ file:///opt/concourse/worker/volumes/live/78fd3e35-67c2-490e-7bb9-0627a6db9485/volume/notebook_1621528340294/work
+numba @ file:///opt/concourse/worker/volumes/live/263a950e-7ddc-4297-63df-0c284f0b6d22/volume/numba_1616774255536/work
+numexpr @ file:///opt/concourse/worker/volumes/live/e845d683-bbb9-4fa2-79ce-743b84c61560/volume/numexpr_1618856522192/work
+numpy @ file:///opt/concourse/worker/volumes/live/6acd2784-8443-45a5-42ec-e10d2d0eaa28/volume/numpy_and_numpy_base_1620831186338/work
+numpydoc @ file:///tmp/build/80754af9/numpydoc_1605117425582/work
+olefile==0.46
+openpyxl @ file:///tmp/build/80754af9/openpyxl_1615411699337/work
+packaging @ file:///tmp/build/80754af9/packaging_1611952188834/work
+pandas==1.2.4
+pandocfilters @ file:///opt/concourse/worker/volumes/live/c330e404-216d-466b-5327-8ce8fe854d3a/volume/pandocfilters_1605120442288/work
+parso==0.7.0
+partd @ file:///tmp/build/80754af9/partd_1618000087440/work
+path @ file:///opt/concourse/worker/volumes/live/6493576b-552d-426b-432c-5e0fafcd8a43/volume/path_1614022213143/work
+pathlib2 @ file:///opt/concourse/worker/volumes/live/cca4007b-e85e-4f77-430e-d30b2149548d/volume/pathlib2_1607024978319/work
+pathspec==0.7.0
+pathtools==0.1.2
+patsy==0.5.1
+pep8==1.7.1
+pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
+pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
+Pillow @ file:///opt/concourse/worker/volumes/live/ca23594b-6e35-4c8c-5637-50ac0b550473/volume/pillow_1617386168018/work
+pkginfo==1.7.0
+pluggy @ file:///opt/concourse/worker/volumes/live/2d655872-b6f5-4225-538c-dd87e481f5c8/volume/pluggy_1615976700299/work
+ply==3.11
+prometheus-client @ file:///tmp/build/80754af9/prometheus_client_1618088486455/work
+prompt-toolkit @ file:///tmp/build/80754af9/prompt-toolkit_1616415428029/work
+psutil @ file:///opt/concourse/worker/volumes/live/0673cd4b-30c1-4470-7490-d8955610f5d5/volume/psutil_1612298002202/work
+ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
+py @ file:///tmp/build/80754af9/py_1607971587848/work
+pycodestyle @ file:///home/ktietz/src/ci_mi/pycodestyle_1612807597675/work
+pycosat==0.6.3
+pycparser @ file:///tmp/build/80754af9/pycparser_1594388511720/work
+pycurl==7.43.0.6
+pydocstyle @ file:///tmp/build/80754af9/pydocstyle_1621600989141/work
+pyerfa @ file:///opt/concourse/worker/volumes/live/f87989f6-b347-4e0a-7be4-c3773473f3c6/volume/pyerfa_1621560794706/work
+pyflakes @ file:///home/ktietz/src/ci_ipy2/pyflakes_1612551159640/work
+Pygments @ file:///tmp/build/80754af9/pygments_1621606182707/work
+pylint @ file:///opt/concourse/worker/volumes/live/39bfc6db-5da7-4bc4-5240-e8a1167491ff/volume/pylint_1617135827184/work
+pyls-black @ file:///tmp/build/80754af9/pyls-black_1607553132291/work
+pyls-spyder @ file:///tmp/build/80754af9/pyls-spyder_1613849700860/work
+PyMySQL @ file:///opt/concourse/worker/volumes/live/91bc5664-f34a-482d-7efc-f19bbd7a1347/volume/pymysql_1610482895985/work
+pyodbc===4.0.0-unsupported
+pyOpenSSL @ file:///tmp/build/80754af9/pyopenssl_1608057966937/work
+pyparsing @ file:///home/linux1/recipes/ci/pyparsing_1610983426697/work
+pyrsistent @ file:///opt/concourse/worker/volumes/live/ff11f3f0-615b-4508-471d-4d9f19fa6657/volume/pyrsistent_1600141727281/work
+PySocks @ file:///opt/concourse/worker/volumes/live/85a5b906-0e08-41d9-6f59-084cee4e9492/volume/pysocks_1594394636991/work
+pytest==6.2.3
+python-dateutil @ file:///home/ktietz/src/ci/python-dateutil_1611928101742/work
+python-jsonrpc-server @ file:///tmp/build/80754af9/python-jsonrpc-server_1600278539111/work
+python-language-server @ file:///tmp/build/80754af9/python-language-server_1607972495879/work
+pytz @ file:///tmp/build/80754af9/pytz_1612215392582/work
+PyWavelets @ file:///opt/concourse/worker/volumes/live/ea36e10f-66e8-43ae-511e-c4092764493f/volume/pywavelets_1601658378672/work
+PyYAML==5.4.1
+pyzmq==20.0.0
+QDarkStyle==2.8.1
+QtAwesome @ file:///tmp/build/80754af9/qtawesome_1615991616277/work
+qtconsole @ file:///tmp/build/80754af9/qtconsole_1616775094278/work
+QtPy==1.9.0
+regex @ file:///opt/concourse/worker/volumes/live/e81f6b8c-e3b5-481b-6b5d-4fa8d9bb9405/volume/regex_1617569701251/work
+requests @ file:///tmp/build/80754af9/requests_1608241421344/work
+rope @ file:///tmp/build/80754af9/rope_1602264064449/work
+Rtree @ file:///opt/concourse/worker/volumes/live/7b97d6e1-aeee-4f6d-418c-32be5bbd5ed3/volume/rtree_1618420839839/work
+ruamel-yaml-conda @ file:///opt/concourse/worker/volumes/live/53b096c9-f5b7-4029-7f1b-056927554e08/volume/ruamel_yaml_1616016691174/work
+scikit-image==0.18.1
+scikit-learn @ file:///opt/concourse/worker/volumes/live/27ad05e5-2546-4218-4550-464724750acc/volume/scikit-learn_1621370399611/work
+scipy @ file:///opt/concourse/worker/volumes/live/7d10d993-3825-404e-6e5d-9947c19e8c6d/volume/scipy_1618855951189/work
+seaborn @ file:///tmp/build/80754af9/seaborn_1608578541026/work
+Send2Trash @ file:///tmp/build/80754af9/send2trash_1607525499227/work
+simplegeneric==0.8.1
+singledispatch @ file:///tmp/build/80754af9/singledispatch_1614366001199/work
+six @ file:///opt/concourse/worker/volumes/live/5b31cb27-1e37-4ca5-6e9f-86246eb206d2/volume/six_1605205320872/work
+sniffio @ file:///opt/concourse/worker/volumes/live/1faeb672-6d3f-4f1f-7861-294fcf282962/volume/sniffio_1614030462215/work
+snowballstemmer @ file:///tmp/build/80754af9/snowballstemmer_1611258885636/work
+sortedcollections @ file:///tmp/build/80754af9/sortedcollections_1611172717284/work
+sortedcontainers @ file:///tmp/build/80754af9/sortedcontainers_1606865132123/work
+soupsieve @ file:///tmp/build/80754af9/soupsieve_1616183228191/work
+Sphinx @ file:///tmp/build/80754af9/sphinx_1620777493457/work
+sphinxcontrib-applehelp @ file:///home/ktietz/src/ci/sphinxcontrib-applehelp_1611920841464/work
+sphinxcontrib-devhelp @ file:///home/ktietz/src/ci/sphinxcontrib-devhelp_1611920923094/work
+sphinxcontrib-htmlhelp @ file:///home/ktietz/src/ci/sphinxcontrib-htmlhelp_1611920974801/work
+sphinxcontrib-jsmath @ file:///home/ktietz/src/ci/sphinxcontrib-jsmath_1611920942228/work
+sphinxcontrib-qthelp @ file:///home/ktietz/src/ci/sphinxcontrib-qthelp_1611921055322/work
+sphinxcontrib-serializinghtml @ file:///home/ktietz/src/ci/sphinxcontrib-serializinghtml_1611920755253/work
+sphinxcontrib-websupport @ file:///tmp/build/80754af9/sphinxcontrib-websupport_1597081412696/work
+spyder @ file:///opt/concourse/worker/volumes/live/b2db1f66-1cfc-4529-6e21-61097ab49952/volume/spyder_1616775698806/work
+spyder-kernels @ file:///opt/concourse/worker/volumes/live/12a19b85-7733-4e39-55a2-982abb6f0274/volume/spyder-kernels_1614030593315/work
+SQLAlchemy @ file:///opt/concourse/worker/volumes/live/e51bfc47-a66d-4e2e-59af-339342299d8c/volume/sqlalchemy_1620712425990/work
+statsmodels @ file:///opt/concourse/worker/volumes/live/f55add71-53cb-4510-4c08-7ae9e294a482/volume/statsmodels_1614023740731/work
+sympy @ file:///opt/concourse/worker/volumes/live/0bf7d347-8399-4af8-5e6f-4267d4aabdd9/volume/sympy_1618255318469/work
+tables==3.6.1
+tblib @ file:///tmp/build/80754af9/tblib_1597928476713/work
+terminado==0.9.4
+testpath @ file:///home/ktietz/src/ci/testpath_1611930608132/work
+textdistance @ file:///tmp/build/80754af9/textdistance_1612461398012/work
+threadpoolctl @ file:///tmp/tmp9twdgx9k/threadpoolctl-2.1.0-py3-none-any.whl
+three-merge @ file:///tmp/build/80754af9/three-merge_1607553261110/work
+tifffile==2020.10.1
+toml @ file:///tmp/build/80754af9/toml_1616166611790/work
+toolz @ file:///home/linux1/recipes/ci/toolz_1610987900194/work
+tornado @ file:///opt/concourse/worker/volumes/live/05341796-4198-4ded-4a9a-332fde3cdfd1/volume/tornado_1606942323372/work
+tqdm @ file:///tmp/build/80754af9/tqdm_1615925068909/work
+traitlets @ file:///home/ktietz/src/ci/traitlets_1611929699868/work
+typed-ast @ file:///opt/concourse/worker/volumes/live/b635b0af-0037-435d-5a8b-38d096f36bbf/volume/typed-ast_1610484559102/work
+typing-extensions @ file:///home/ktietz/src/ci_mi/typing_extensions_1612808209620/work
+ujson @ file:///opt/concourse/worker/volumes/live/b4182deb-c6ce-4bc8-7783-61027c162049/volume/ujson_1611259506235/work
+unicodecsv==0.14.1
+urllib3 @ file:///tmp/build/80754af9/urllib3_1615837158687/work
+watchdog @ file:///opt/concourse/worker/volumes/live/ba071b65-d6ec-4539-5875-0791be503584/volume/watchdog_1612471127391/work
+wcwidth @ file:///tmp/build/80754af9/wcwidth_1593447189090/work
+webencodings==0.5.1
+Werkzeug @ file:///home/ktietz/src/ci/werkzeug_1611932622770/work
+widgetsnbextension==3.5.1
+wrapt @ file:///opt/concourse/worker/volumes/live/e3646d84-e961-4523-6bed-01532273c57e/volume/wrapt_1597851473852/work
+wurlitzer @ file:///opt/concourse/worker/volumes/live/bf42cded-f988-433c-5eca-a88c3905d29b/volume/wurlitzer_1617224650715/work
+xlrd @ file:///tmp/build/80754af9/xlrd_1608072521494/work
+XlsxWriter @ file:///tmp/build/80754af9/xlsxwriter_1617224712951/work
+xlwings==0.23.0
+xlwt==1.3.0
+xmltodict==0.12.0
+yapf @ file:///tmp/build/80754af9/yapf_1615749224965/work
+zict==2.0.0
+zipp @ file:///tmp/build/80754af9/zipp_1615904174917/work
+zope.event==4.5.0
+zope.interface @ file:///opt/concourse/worker/volumes/live/2b8309a3-732c-445e-5670-ab75beb9637c/volume/zope.interface_1616357204515/work
diff --git a/stopWordTest.py b/stopWordTest.py
new file mode 100644
index 0000000..acb19e7
--- /dev/null
+++ b/stopWordTest.py
@@ -0,0 +1,443 @@
+# -*- coding: utf-8 -*-
+# @Author: Gree
+# @Date:   2021-05-29 15:26:57
+# @Last Modified by:   Gree
+# @Last Modified time: 2021-05-31 13:48:14
+
+
+import json
+import os
+import time
+
+
+# 第三方库
+import csv
+import matplotlib as mpl
+import numpy as np
+import pandas as pd
+import requests
+from config import readConfig as rc
+from data import readData as rd
+from tqdm import tqdm
+from matplotlib import pyplot as plt
+from matplotlib.font_manager import FontProperties
+
+
+class StopWordTest:
+    def __init__(self):
+        """
+        __init__ 函数:
+        input:
+        output:
+        features: 定义初始变量，输出初始日志信息
+        step1: 定义 self.dirPath、self.url、self.headers 等对象
+        """
+        # 输出 log 信息
+        print("#" * 30, end = "")
+        # 输出 log 信息
+        print("Loading the module of stopWordTest ....", end = "")
+        # 输出 log 信息
+        print("#" * 30)
+        # 定义 self.dirPath
+        self.dirPath = os.path.split(os.path.realpath(__file__))[0] + "/result"
+        # 测试接口的 url
+        self.url = "https://testnlu.gree.com/regnlu/query"
+        # 定义请求头
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
+        }
+        # 定义 self.initial_list
+        self.initial_list = rd.ReadData().readData()
+        # 定义 initial_df
+        self.initial_df = rd.ReadData().getData(self.initial_list)
+        # 定义 self.stopWordList
+        self.stopWordList = rc.ReadConfig().readConfig()
+        # 定义 self.correctRateLst
+        self.correctRateLst = []
+        # 定义 self.errorRateLst
+        self.errorRateLst = []
+
+
+    def dealStopWord(self, stopWord):
+        """
+        dealStopWord 函数:
+        input: stopWord
+        output: df
+        features: 处理停用词，去噪声
+        step1: 进行停用词处理，去噪声
+        step2: 返回经过处理后数据框
+        """
+        # 输出 log 信息
+        print("==========================进入停用词处理============================")
+        # 输出 log 信息
+        print("正在处理停用词：", stopWord)
+
+        # 新建 dealStopWordList
+        dealStopWordList = []
+        # copy self.df
+        df = self.initial_df.copy()
+
+        # 遍历
+        for index, row in df.iterrows():
+            # 捕获异常
+            try:
+                # 条件判断
+                if stopWord in row["query"]:
+                    # 赋值
+                    initial_query = row["query"]
+                    # 字符串替换
+                    row["query"] = row["query"].replace(stopWord, "")
+                    # 条件判断
+                    if row["query"] != "":
+                        # 构建字典
+                        stopWordDict = {
+                            "domain": row["domain"],
+                            "intent": row["intent"],
+                            "initial_query": initial_query,
+                            "query": row["query"]
+                        }
+
+                        # 列表添加元素
+                        dealStopWordList.append(stopWordDict)
+
+                    else:
+                        pass
+
+                else:
+                    pass
+
+            except Exception as e:
+                print("The error of dealing stopWord: ", e)
+
+        # 捕获异常
+        try:
+            # 列表转为数据框
+            df = pd.DataFrame(dealStopWordList, columns = ["domain","intent", "initial_query", "query"])
+            # 数据去重
+            # df = df.drop_duplicates(subset = "query")
+            # 输出 log 信息
+            print("停用词处理结束 ...")
+            # 输出 log 信息
+            print("The dimension of df after dealing stopWord: ", end = "")
+            # 数据数据框的维度
+            print(df.shape)
+            # 输出 log 信息
+            print("==========================退出停用词处理============================")
+            # 返回数据框
+            return df
+
+        except Exception as e:
+            print("The error of converting list to data frame: ", e)
+
+
+    def dataTest(self, df):
+        """
+        dataTest 函数:
+        input: df
+        output: df
+        features: 请求测试接口，返回测试数据
+        step1: 请求测试接口：https://nlu.gree.com/test2/regnlu/query
+        step2: 返回 df
+        """
+        # 输出 log 信息
+        print("=========================批量发起网络请求===========================")
+        # 输出 log 信息
+        print("Requesting url ...")
+
+        # copy df
+        df = df.copy()
+
+        # 捕获异常
+        try:
+            # for 循环
+            for k in tqdm(range(0,len(df["query"]))):
+                # 捕获异常
+                try:
+                    # 获取字段
+                    query = df["query"].iloc[k]
+                    # 定义 para
+                    para = {"clientId":"test_zww","sessionId":"test_zww","agentId":"YTYxMjRmZDY1NGM1","token":"bb2c960a3a8e018fab0a7b1bccfab91d3855","query":query }
+                    # 字典转字符串
+                    para = json.dumps(para)
+                    # 发起 post 请求
+                    r = requests.post(url = self.url, headers = self.headers, data = para, timeout = 60)
+
+                except Exception as e:
+                    print("The error of requesting url: ", e)
+
+                # 捕获异常
+                try:
+                    # 数据框操作
+                    df.loc[k,"url_domain"] = json.loads(r.text).get("semantic").get("service")
+                    df.loc[k,"url_intent"] = json.loads(r.text).get("semantic").get("action")
+
+                except Exception as e:
+                    print("The error of operating dataframe: ", e)
+
+        except Exception as e:
+            print("The error of dataTest(): ", e)
+
+        else:
+            # 返回 df
+            return df
+
+        finally:
+            # 输出 log 信息
+            print("The process of requesting url finished!")
+            print("=========================结束所有网络请求===========================")
+
+
+    def dataEvaluation(self, stopWord, df, countCorrect, countError):
+        """
+        Evaluate the tested corpus and report the recognition rates.
+
+        A row is counted as correctly recognised when its "domain" equals
+        "chat" and its "url_domain" is not the empty string. A row is counted
+        as wrongly recognised when its "domain" differs from "chat" and its
+        "url_domain" is not the empty string. Rows whose "url_domain" is the
+        empty string are counted in neither group.
+
+        input:
+            stopWord: stop word under test; not used by the evaluation itself,
+                kept so that existing callers keep working
+            df: DataFrame providing the columns "domain", "intent",
+                "initial_query", "query", "url_domain" and "url_intent"
+            countCorrect: starting value of the correct-row counter
+            countError: starting value of the wrong-row counter
+        output:
+            (correctRate, errorRate, rowCorrectDictLst, rowErrorDictLst)
+            correctRate, errorRate: the matching counter divided by the number
+                of rows in df, or 0.0 when df has no rows
+            rowCorrectDictLst, rowErrorDictLst: one dict per matching row,
+                keyed by the six columns listed above
+        features: prints both counters and both rates to stdout
+        """
+        # Columns copied into every collected row dict, in output order.
+        fieldNames = ("domain", "intent", "initial_query", "query", "url_domain", "url_intent")
+
+        def collectRows(wantChat, label):
+            """
+            Return the rows of df that belong to one group, as dicts.
+
+            input:
+                wantChat: True selects rows whose "domain" equals "chat",
+                    False selects the remaining rows
+                label: word inserted into the message printed when a row
+                    cannot be read
+            output: list of dicts keyed by fieldNames
+            """
+            matchedRows = []
+            for _, row in df.iterrows():
+                # A row that cannot be read is reported and skipped, so one
+                # bad row does not abort the whole evaluation.
+                try:
+                    isChat = row["domain"] == "chat"
+                    if isChat == wantChat and row["url_domain"] != "":
+                        matchedRows.append({name: row[name] for name in fieldNames})
+                except Exception as e:
+                    print("The error of counting {} corpus: ".format(label), e)
+            return matchedRows
+
+        rowCorrectDictLst = collectRows(True, "correct")
+        rowErrorDictLst = collectRows(False, "error")
+
+        # The counters start from the values supplied by the caller.
+        countCorrect += len(rowCorrectDictLst)
+        countError += len(rowErrorDictLst)
+
+        # Guard the division: an empty df would raise ZeroDivisionError.
+        totalRows = df.shape[0]
+        correctRate = countCorrect / totalRows if totalRows else 0.0
+        errorRate = countError / totalRows if totalRows else 0.0
+
+        # Report the correctly recognised corpus.
+        print("=====================计算正确识别语料数量和比率=====================")
+        print("正确识别语料数量：", str(countCorrect))
+        print("正确识别语料比率：", str(correctRate))
+        print("=============================计算结束===============================")
+
+        # Report the wrongly recognised corpus.
+        print("=====================计算错误识别语料数量和比率=====================")
+        print("错误识别语料数量：", str(countError))
+        print("错误识别语料比率：", str(errorRate))
+        print("=============================计算结束===============================")
+
+        return correctRate, errorRate, rowCorrectDictLst, rowErrorDictLst
+
+
+    def WriteData(self, filePath, lst):
+        """
+        Persist a list of row dicts as a CSV file.
+
+        input:
+            filePath: path of the CSV file; an existing file is overwritten
+            lst: iterable of dicts keyed by "domain", "intent",
+                "initial_query", "query", "url_domain" and "url_intent"
+        output: None; a header row followed by one line per dict is written
+            to filePath, encoded as UTF-8
+        """
+        print("Writing file:", filePath)
+        fieldNames = ("domain", "intent", "initial_query", "query", "url_domain", "url_intent")
+        # newline = "" leaves line endings to the csv module, which avoids
+        # blank lines between rows on platforms that translate line endings.
+        # The with block closes the file, so no explicit close() is needed.
+        with open(filePath, "w", encoding = "utf-8", newline = "") as f:
+            writer = csv.DictWriter(f, fieldnames = fieldNames)
+            writer.writeheader()
+            writer.writerows(lst)
+
+
+    def drawPicture(self, correctRateLst, errorRateLst):
+        """
+        Draw a grouped bar chart of the correct and error recognition rates.
+
+        input:
+            correctRateLst: correct-recognition rates, one value per corpus
+            errorRateLst: error-recognition rates, in the same order and of
+                the same length as correctRateLst
+        output: None; the chart is displayed through plt.show()
+        """
+        print("=============================正在绘图===============================")
+
+        # Font settings so that the Chinese labels can be rendered.
+        mpl.rcParams["font.sans-serif"] = ["Songti SC"]
+        mpl.rcParams["axes.unicode_minus"] = False
+
+        # One bar group per entry. The positions follow the number of rates
+        # instead of a fixed 3, so any number of stop words can be plotted.
+        x = np.arange(len(correctRateLst))
+        barWidth = 0.35
+        tickLabelLst = ["corpus{}".format(i) for i in range(1, len(correctRateLst) + 1)]
+
+        plt.bar(x, correctRateLst, barWidth, align = "center", color = "c", label = "正确识别语料比率", alpha = 0.5)
+        plt.bar(x + barWidth, errorRateLst, barWidth, color = "b", align = "center", label = "错误识别语料比率", alpha = 0.5)
+
+        plt.xlabel("相关语料")
+        plt.ylabel("比率")
+
+        # Centre each tick label between the two bars of its group.
+        plt.xticks(x + barWidth/2, tickLabelLst)
+        plt.legend()
+        plt.show()
+
+        print("=============================退出绘图===============================")
+
+
+    def main(self):
+        """
+        Run the whole stop-word test.
+
+        step1: create the result directory self.dirPath when it is missing
+        step2: for every entry of self.stopWordList call dealStopWord,
+            dataTest and dataEvaluation in turn, write the two result CSV
+            files and remember both rates
+        step3: pass all collected rates to drawPicture
+        """
+        print("==========================判断文件存在性============================")
+        if not os.path.exists(self.dirPath):
+            os.makedirs(self.dirPath)
+            print(self.dirPath + "：目录创建成功")
+        else:
+            print(self.dirPath + "：目录已存在")
+        print("=============================判断结束===============================")
+
+        for word in self.stopWordList:
+            processedDf = self.dealStopWord(stopWord = word)
+            testedDf = self.dataTest(df = processedDf)
+            evaluation = self.dataEvaluation(stopWord = word, df = testedDf, countCorrect = 0, countError = 0)
+            correctRate, errorRate, correctRows, errorRows = evaluation
+
+            # Both result files are named after the stop word.
+            correctFilePath = os.path.join(self.dirPath, word + "_correct.csv")
+            errorFilePath = os.path.join(self.dirPath, word + "_error.csv")
+
+            print("=============================写入文件===============================")
+            for targetPath, rows in ((correctFilePath, correctRows), (errorFilePath, errorRows)):
+                self.WriteData(targetPath, rows)
+            print("=============================写入完成===============================")
+
+            self.correctRateLst.append(correctRate)
+            self.errorRateLst.append(errorRate)
+
+        self.drawPicture(self.correctRateLst, self.errorRateLst)
+
+
+
+
+
+if __name__ == '__main__':
+    # Script entry point: run the complete test, then print a closing banner.
+    stopWordTest = StopWordTest()
+    stopWordTest.main()
+    banner = "#" * 30
+    # sep = "" joins the three parts without spaces, on a single line.
+    print(banner, "The process of stopWordTest finished!", banner, sep = "")
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-- 
GitLab