From addbb1841f7ce552fb846cd1f2ba2e52ca4b13cd Mon Sep 17 00:00:00 2001 From: StudentCWZ <330459539@qq.com> Date: Mon, 31 May 2021 16:29:13 +0800 Subject: [PATCH] a new project of testing stop-words effect --- README.md | 47 ++++- config/__init__.py | 0 config/readConfig.py | 38 ++++ config/stopWord.conf | 2 + data/__init__.py | 0 data/readData.py | 91 +++++++++ requirement.txt | 256 +++++++++++++++++++++++++ stopWordTest.py | 443 +++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 874 insertions(+), 3 deletions(-) create mode 100644 config/__init__.py create mode 100644 config/readConfig.py create mode 100644 config/stopWord.conf create mode 100644 data/__init__.py create mode 100644 data/readData.py create mode 100644 requirement.txt create mode 100644 stopWordTest.py diff --git a/README.md b/README.md index 1bd3c96..62fd521 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,44 @@ -# StopWordTest - -It is a project of testing stop-words effect. \ No newline at end of file +# 项目简介 +- 项目名称:停用词效果测试 +- 功能说明:根据提供的停用词,测试停用词的效果。 +- 代码仓库:https://api.gree.com/gitlab/cuiweizhi/StopWordTest.git +- 项目负责人:崔为之 +- 目录结构: +``` +├─config +| ├─__init__.py +| ├─stopWord.conf // 配置文件 +| ├─readConfig.py // 读取配置脚本 +│ +├─data +│ ├─data.csv //原始数据文件(过大,不上传) +| │ +| └─readData.py //读取数据脚本 +| +| +├─result //最终数据存储路径文件夹 (运行主体代码自动生成) +| +| +├─requirement.txt //python3 环境的配置 +| +| +└─stopWordTest.py // 主体代码 +``` +# 如何运行 +- 创建虚拟环境: python -m venv venv +- 安装软件包: pip install -r requirement.txt +- 运行服务: +``` +(1) 修改配置文件:stopWord.conf 和 data.csv (根据自己需要修改) +(2) 运行服务:python3 stopWordTest.py +``` +# 版本信息 +``` +v1.0 +``` +# 更新日志 +- v1.0 版本 +``` +(1) 首次创建项目 +(2) 提供技能 +``` diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/config/readConfig.py b/config/readConfig.py new file mode 100644 index 0000000..22a74e2 --- /dev/null +++ b/config/readConfig.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# @Author: Gree +# @Date: 2021-05-29 14:49:12 +# 
# -*- coding: utf-8 -*-
# Readers for the stop-word configuration (config/readConfig.py) and for the
# raw corpus CSV (data/readData.py).

import configparser
import csv
import os

import pandas as pd


class ReadConfig:
    """Load the stop-word list from ``stopWord.conf``.

    The file is an INI document with a ``[stop_word]`` section whose
    ``stop_word`` option holds the stop words separated by commas.
    """

    def __init__(self):
        """Resolve the config path next to this module and create the parser."""
        # Directory that contains this source file.
        self.dirPath = os.path.dirname(os.path.realpath(__file__))
        # The configuration file lives beside the module.
        self.filePath = os.path.join(self.dirPath, "stopWord.conf")
        # Parser that readConfig() fills in.
        self.cf = configparser.ConfigParser()

    def readConfig(self):
        """Return the configured stop words as a list of strings.

        Surrounding whitespace is stripped and empty entries are dropped:
        an empty stop word is a substring of every query and would make the
        whole corpus "match".

        Raises:
            FileNotFoundError: if the configuration file does not exist.
            KeyError: if the ``stop_word`` section or option is missing.
        """
        # Open the file explicitly: ConfigParser.read() silently skips missing
        # files and uses the platform default encoding, which breaks on
        # non-ASCII stop words on systems whose default is not UTF-8.
        with open(self.filePath, "r", encoding="utf-8") as f:
            self.cf.read_file(f)

        raw = self.cf["stop_word"]["stop_word"]
        stopWordList = [word.strip() for word in raw.split(",") if word.strip()]
        return stopWordList


class ReadData:
    """Read the corpus from ``data.csv`` and turn it into a DataFrame."""

    def __init__(self):
        """Resolve the CSV path next to this module."""
        # Directory that contains this source file.
        self.dirPath = os.path.dirname(os.path.realpath(__file__))
        # The corpus file lives beside the module.
        self.filePath = os.path.join(self.dirPath, "data.csv")
        # Not used by the methods below; kept because the attribute is public.
        self.cf = configparser.ConfigParser()
        # Not used by the methods below; kept because the attribute is public.
        self.initial_list = []

    def readData(self):
        """Return every CSV row as a list of strings (header row first).

        On a read failure the error is printed and an empty list is returned,
        so callers always receive a list.
        """
        try:
            # "utf-8-sig" accepts files with or without a BOM; newline="" is
            # what the csv module requires for correct quoted-newline handling.
            # The with-statement closes the file, so no finally/close is needed.
            with open(self.filePath, "r", encoding="utf-8-sig", newline="") as f:
                return list(csv.reader(f))
        except (OSError, UnicodeDecodeError, csv.Error) as e:
            print("I/O error: ", e)
            return []

    def getData(self, initial_list):
        """Build a DataFrame from ``initial_list`` and de-duplicate on ``query``.

        Args:
            initial_list: rows as returned by readData(); the first row is
                used as the column names.

        Returns:
            The DataFrame with duplicate ``query`` values removed (first
            occurrence kept), or ``None`` if the frame cannot be built or has
            no ``query`` column. The failure is printed in that case.
        """
        if not initial_list:
            print("The error of getting initial_df: ", "initial_list is empty")
            return None

        header, *rows = initial_list
        try:
            initial_df = pd.DataFrame(rows, columns=header)
        except (ValueError, AssertionError, TypeError) as e:
            # Typically a row that is wider than the header.
            print("The error of getting initial_df: ", e)
            return None

        print("数据框去重前:")
        print("The dimension of initial_df: ", end="")
        print(initial_df.shape)

        try:
            df = initial_df.drop_duplicates(subset="query")
        except KeyError as e:
            # The CSV has no "query" column.
            print("The error of getting df: ", e)
            return None

        print("数据框去重后:")
        print("The dimension of df: ", end="")
        print(df.shape)
        return df
+astroid @ file:///opt/concourse/worker/volumes/live/343a8902-287c-47fb-6db8-923a63364302/volume/astroid_1613500849157/work +astropy @ file:///opt/concourse/worker/volumes/live/0a514e04-301a-48f9-530f-90365df6420e/volume/astropy_1617745469121/work +async-generator @ file:///home/ktietz/src/ci/async_generator_1611927993394/work +atomicwrites==1.4.0 +attrs @ file:///tmp/build/80754af9/attrs_1620827162558/work +autopep8 @ file:///tmp/build/80754af9/autopep8_1615918855173/work +Babel @ file:///tmp/build/80754af9/babel_1620871417480/work +backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work +backports.functools-lru-cache @ file:///tmp/build/80754af9/backports.functools_lru_cache_1618170165463/work +backports.shutil-get-terminal-size @ file:///tmp/build/80754af9/backports.shutil_get_terminal_size_1608222128777/work +backports.tempfile @ file:///home/linux1/recipes/ci/backports.tempfile_1610991236607/work +backports.weakref==1.0.post1 +beautifulsoup4 @ file:///home/linux1/recipes/ci/beautifulsoup4_1610988766420/work +bitarray @ file:///opt/concourse/worker/volumes/live/8a51e4ff-5d78-46c1-45c6-5df5fa3f52c4/volume/bitarray_1620827546654/work +bkcharts==0.2 +black==19.10b0 +bleach @ file:///tmp/build/80754af9/bleach_1612211392645/work +bokeh @ file:///opt/concourse/worker/volumes/live/65ce9588-f765-4deb-5b52-1c552a693654/volume/bokeh_1620783891289/work +boto==2.49.0 +Bottleneck==1.3.2 +brotlipy==0.7.0 +certifi==2020.12.5 +cffi @ file:///opt/concourse/worker/volumes/live/0ef369cc-6ba0-47e7-75da-208c6400381d/volume/cffi_1613246948181/work +chardet @ file:///opt/concourse/worker/volumes/live/c798b2ee-88b1-4341-6830-161a92c2399e/volume/chardet_1607706832595/work +click @ file:///tmp/build/80754af9/click_1621604852318/work +cloudpickle @ file:///tmp/build/80754af9/cloudpickle_1598884132938/work +clyent==1.2.2 +colorama @ file:///tmp/build/80754af9/colorama_1607707115595/work +conda==4.10.1 +conda-build==3.21.4 +conda-content-trust @ 
file:///tmp/build/80754af9/conda-content-trust_1617045594566/work +conda-pack @ file:///tmp/build/80754af9/conda-pack_1611163042455/work +conda-package-handling @ file:///opt/concourse/worker/volumes/live/73497069-9b43-4ad9-50ec-1abb340e14eb/volume/conda-package-handling_1618262140058/work +conda-repo-cli @ file:///tmp/build/80754af9/conda-repo-cli_1620168426516/work +conda-token @ file:///tmp/build/80754af9/conda-token_1620076980546/work +conda-verify==3.4.2 +contextlib2==0.6.0.post1 +cryptography @ file:///opt/concourse/worker/volumes/live/c515855a-effc-46df-74dc-542901b701da/volume/cryptography_1616769282442/work +cycler==0.10.0 +Cython @ file:///opt/concourse/worker/volumes/live/da4db94a-3449-4978-4400-64181f888dab/volume/cython_1618435143829/work +cytoolz==0.11.0 +dask @ file:///tmp/build/80754af9/dask-core_1617390489108/work +decorator @ file:///home/ktietz/src/ci/decorator_1611930055503/work +defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work +diff-match-patch @ file:///tmp/build/80754af9/diff-match-patch_1594828741838/work +distributed @ file:///opt/concourse/worker/volumes/live/66d9a668-fe37-485b-57c1-fefe2be682be/volume/distributed_1621290174414/work +docutils @ file:///opt/concourse/worker/volumes/live/5fb9307e-f900-46e0-7a3f-bf61cc0172f2/volume/docutils_1620827968414/work +entrypoints==0.3 +et-xmlfile==1.1.0 +fake-useragent==0.1.11 +fastcache==1.1.0 +filelock @ file:///home/linux1/recipes/ci/filelock_1610993975404/work +flake8 @ file:///tmp/build/80754af9/flake8_1615834841867/work +Flask @ file:///home/ktietz/src/ci/flask_1611932660458/work +fsspec @ file:///tmp/build/80754af9/fsspec_1617959894824/work +future==0.18.2 +gevent @ file:///opt/concourse/worker/volumes/live/123efac8-0706-463e-4084-078386897222/volume/gevent_1616772940436/work +glob2 @ file:///home/linux1/recipes/ci/glob2_1610991677669/work +gmpy2==2.0.8 +greenlet @ 
file:///opt/concourse/worker/volumes/live/3ef4f3fa-d3c5-4d18-614a-db20c54a531f/volume/greenlet_1620913631134/work +h5py==2.10.0 +HeapDict==1.0.1 +html5lib @ file:///tmp/build/80754af9/html5lib_1593446221756/work +idna @ file:///home/linux1/recipes/ci/idna_1610986105248/work +imageio @ file:///tmp/build/80754af9/imageio_1617700267927/work +imagesize @ file:///home/ktietz/src/ci/imagesize_1611921604382/work +importlib-metadata @ file:///opt/concourse/worker/volumes/live/a634a87c-b5e5-41bd-628d-cd0413666c93/volume/importlib-metadata_1617877368300/work +iniconfig @ file:///home/linux1/recipes/ci/iniconfig_1610983019677/work +intervaltree @ file:///tmp/build/80754af9/intervaltree_1598376443606/work +ipykernel @ file:///opt/concourse/worker/volumes/live/88f541d3-5a27-498f-7391-f2e50ca36560/volume/ipykernel_1596206680118/work/dist/ipykernel-5.3.4-py3-none-any.whl +ipython @ file:///opt/concourse/worker/volumes/live/c432d8a7-d8f3-4e24-590f-f03d7e5f35e1/volume/ipython_1617120884257/work +ipython-genutils @ file:///tmp/build/80754af9/ipython_genutils_1606773439826/work +ipywidgets @ file:///tmp/build/80754af9/ipywidgets_1610481889018/work +isort @ file:///tmp/build/80754af9/isort_1616355431277/work +itsdangerous @ file:///tmp/build/80754af9/itsdangerous_1621432558163/work +jdcal==1.4.1 +jedi @ file:///opt/concourse/worker/volumes/live/12a2c347-a8e4-4b62-5b19-dcc92a2254f6/volume/jedi_1606932552286/work +Jinja2 @ file:///tmp/build/80754af9/jinja2_1621238361758/work +joblib @ file:///tmp/build/80754af9/joblib_1613502643832/work +json5==0.9.5 +jsonschema @ file:///tmp/build/80754af9/jsonschema_1602607155483/work +jupyter==1.0.0 +jupyter-client @ file:///tmp/build/80754af9/jupyter_client_1616770841739/work +jupyter-console @ file:///tmp/build/80754af9/jupyter_console_1616615302928/work +jupyter-core @ file:///opt/concourse/worker/volumes/live/c8df8dce-dbb3-46e7-649c-adf4ed2dd00a/volume/jupyter_core_1612213293829/work +jupyter-packaging @ 
file:///tmp/build/80754af9/jupyter-packaging_1613502826984/work +jupyter-server @ file:///opt/concourse/worker/volumes/live/4f2e970c-4cba-4227-75ae-10ebe3b5f9c3/volume/jupyter_server_1616084051538/work +jupyterlab @ file:///tmp/build/80754af9/jupyterlab_1619133235951/work +jupyterlab-pygments @ file:///tmp/build/80754af9/jupyterlab_pygments_1601490720602/work +jupyterlab-server @ file:///tmp/build/80754af9/jupyterlab_server_1617134334258/work +jupyterlab-widgets @ file:///tmp/build/80754af9/jupyterlab_widgets_1609884341231/work +keyring @ file:///opt/concourse/worker/volumes/live/9bd395b7-ac9a-4d22-790b-ab9dc88e7c19/volume/keyring_1621524569678/work +kiwisolver @ file:///opt/concourse/worker/volumes/live/0b2f3e77-eaa3-4995-7dd0-c994762fcbde/volume/kiwisolver_1612282417472/work +lazy-object-proxy @ file:///opt/concourse/worker/volumes/live/e4bc3ba3-f365-4387-5772-cbb667714c62/volume/lazy-object-proxy_1616529072711/work +libarchive-c @ file:///tmp/build/80754af9/python-libarchive-c_1617780486945/work +llvmlite==0.36.0 +locket==0.2.1 +lxml @ file:///opt/concourse/worker/volumes/live/42fc56a4-88b4-4605-6037-9605f4e5eeeb/volume/lxml_1616443240194/work +MarkupSafe @ file:///opt/concourse/worker/volumes/live/c9141381-1dba-485b-7c96-99007bf7bcfd/volume/markupsafe_1621528150226/work +matplotlib @ file:///opt/concourse/worker/volumes/live/41e8cd50-031f-4dda-5787-dd3c4f4e0f08/volume/matplotlib-suite_1613407855571/work +mccabe==0.6.1 +mistune @ file:///opt/concourse/worker/volumes/live/95802d64-d39c-491b-74ce-b9326880ca54/volume/mistune_1594373201816/work +mkl-fft==1.3.0 +mkl-random @ file:///opt/concourse/worker/volumes/live/54b31a45-1da5-4512-5c3a-93c9ff2af8bc/volume/mkl_random_1618853970587/work +mkl-service==2.3.0 +mock @ file:///tmp/build/80754af9/mock_1607622725907/work +more-itertools @ file:///tmp/build/80754af9/more-itertools_1613676688952/work +mpmath==1.2.1 +msgpack @ 
file:///opt/concourse/worker/volumes/live/d7400f3a-e5de-4e85-5d4c-0c984c648401/volume/msgpack-python_1612287157185/work +multipledispatch==0.6.0 +mypy-extensions==0.4.3 +navigator-updater==0.2.1 +nbclassic @ file:///tmp/build/80754af9/nbclassic_1616085367084/work +nbclient @ file:///tmp/build/80754af9/nbclient_1614364831625/work +nbconvert @ file:///opt/concourse/worker/volumes/live/2b9c1d93-d0fd-432f-7d93-66c93d81b614/volume/nbconvert_1601914875037/work +nbformat @ file:///tmp/build/80754af9/nbformat_1617383369282/work +nest-asyncio @ file:///tmp/build/80754af9/nest-asyncio_1613680548246/work +networkx @ file:///tmp/build/80754af9/networkx_1617653298338/work +nltk @ file:///tmp/build/80754af9/nltk_1621347441292/work +nose @ file:///tmp/build/80754af9/nose_1606773131901/work +notebook @ file:///opt/concourse/worker/volumes/live/78fd3e35-67c2-490e-7bb9-0627a6db9485/volume/notebook_1621528340294/work +numba @ file:///opt/concourse/worker/volumes/live/263a950e-7ddc-4297-63df-0c284f0b6d22/volume/numba_1616774255536/work +numexpr @ file:///opt/concourse/worker/volumes/live/e845d683-bbb9-4fa2-79ce-743b84c61560/volume/numexpr_1618856522192/work +numpy @ file:///opt/concourse/worker/volumes/live/6acd2784-8443-45a5-42ec-e10d2d0eaa28/volume/numpy_and_numpy_base_1620831186338/work +numpydoc @ file:///tmp/build/80754af9/numpydoc_1605117425582/work +olefile==0.46 +openpyxl @ file:///tmp/build/80754af9/openpyxl_1615411699337/work +packaging @ file:///tmp/build/80754af9/packaging_1611952188834/work +pandas==1.2.4 +pandocfilters @ file:///opt/concourse/worker/volumes/live/c330e404-216d-466b-5327-8ce8fe854d3a/volume/pandocfilters_1605120442288/work +parso==0.7.0 +partd @ file:///tmp/build/80754af9/partd_1618000087440/work +path @ file:///opt/concourse/worker/volumes/live/6493576b-552d-426b-432c-5e0fafcd8a43/volume/path_1614022213143/work +pathlib2 @ file:///opt/concourse/worker/volumes/live/cca4007b-e85e-4f77-430e-d30b2149548d/volume/pathlib2_1607024978319/work +pathspec==0.7.0 
+pathtools==0.1.2 +patsy==0.5.1 +pep8==1.7.1 +pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work +pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work +Pillow @ file:///opt/concourse/worker/volumes/live/ca23594b-6e35-4c8c-5637-50ac0b550473/volume/pillow_1617386168018/work +pkginfo==1.7.0 +pluggy @ file:///opt/concourse/worker/volumes/live/2d655872-b6f5-4225-538c-dd87e481f5c8/volume/pluggy_1615976700299/work +ply==3.11 +prometheus-client @ file:///tmp/build/80754af9/prometheus_client_1618088486455/work +prompt-toolkit @ file:///tmp/build/80754af9/prompt-toolkit_1616415428029/work +psutil @ file:///opt/concourse/worker/volumes/live/0673cd4b-30c1-4470-7490-d8955610f5d5/volume/psutil_1612298002202/work +ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl +py @ file:///tmp/build/80754af9/py_1607971587848/work +pycodestyle @ file:///home/ktietz/src/ci_mi/pycodestyle_1612807597675/work +pycosat==0.6.3 +pycparser @ file:///tmp/build/80754af9/pycparser_1594388511720/work +pycurl==7.43.0.6 +pydocstyle @ file:///tmp/build/80754af9/pydocstyle_1621600989141/work +pyerfa @ file:///opt/concourse/worker/volumes/live/f87989f6-b347-4e0a-7be4-c3773473f3c6/volume/pyerfa_1621560794706/work +pyflakes @ file:///home/ktietz/src/ci_ipy2/pyflakes_1612551159640/work +Pygments @ file:///tmp/build/80754af9/pygments_1621606182707/work +pylint @ file:///opt/concourse/worker/volumes/live/39bfc6db-5da7-4bc4-5240-e8a1167491ff/volume/pylint_1617135827184/work +pyls-black @ file:///tmp/build/80754af9/pyls-black_1607553132291/work +pyls-spyder @ file:///tmp/build/80754af9/pyls-spyder_1613849700860/work +PyMySQL @ file:///opt/concourse/worker/volumes/live/91bc5664-f34a-482d-7efc-f19bbd7a1347/volume/pymysql_1610482895985/work +pyodbc===4.0.0-unsupported +pyOpenSSL @ file:///tmp/build/80754af9/pyopenssl_1608057966937/work +pyparsing @ file:///home/linux1/recipes/ci/pyparsing_1610983426697/work +pyrsistent @ 
file:///opt/concourse/worker/volumes/live/ff11f3f0-615b-4508-471d-4d9f19fa6657/volume/pyrsistent_1600141727281/work +PySocks @ file:///opt/concourse/worker/volumes/live/85a5b906-0e08-41d9-6f59-084cee4e9492/volume/pysocks_1594394636991/work +pytest==6.2.3 +python-dateutil @ file:///home/ktietz/src/ci/python-dateutil_1611928101742/work +python-jsonrpc-server @ file:///tmp/build/80754af9/python-jsonrpc-server_1600278539111/work +python-language-server @ file:///tmp/build/80754af9/python-language-server_1607972495879/work +pytz @ file:///tmp/build/80754af9/pytz_1612215392582/work +PyWavelets @ file:///opt/concourse/worker/volumes/live/ea36e10f-66e8-43ae-511e-c4092764493f/volume/pywavelets_1601658378672/work +PyYAML==5.4.1 +pyzmq==20.0.0 +QDarkStyle==2.8.1 +QtAwesome @ file:///tmp/build/80754af9/qtawesome_1615991616277/work +qtconsole @ file:///tmp/build/80754af9/qtconsole_1616775094278/work +QtPy==1.9.0 +regex @ file:///opt/concourse/worker/volumes/live/e81f6b8c-e3b5-481b-6b5d-4fa8d9bb9405/volume/regex_1617569701251/work +requests @ file:///tmp/build/80754af9/requests_1608241421344/work +rope @ file:///tmp/build/80754af9/rope_1602264064449/work +Rtree @ file:///opt/concourse/worker/volumes/live/7b97d6e1-aeee-4f6d-418c-32be5bbd5ed3/volume/rtree_1618420839839/work +ruamel-yaml-conda @ file:///opt/concourse/worker/volumes/live/53b096c9-f5b7-4029-7f1b-056927554e08/volume/ruamel_yaml_1616016691174/work +scikit-image==0.18.1 +scikit-learn @ file:///opt/concourse/worker/volumes/live/27ad05e5-2546-4218-4550-464724750acc/volume/scikit-learn_1621370399611/work +scipy @ file:///opt/concourse/worker/volumes/live/7d10d993-3825-404e-6e5d-9947c19e8c6d/volume/scipy_1618855951189/work +seaborn @ file:///tmp/build/80754af9/seaborn_1608578541026/work +Send2Trash @ file:///tmp/build/80754af9/send2trash_1607525499227/work +simplegeneric==0.8.1 +singledispatch @ file:///tmp/build/80754af9/singledispatch_1614366001199/work +six @ 
file:///opt/concourse/worker/volumes/live/5b31cb27-1e37-4ca5-6e9f-86246eb206d2/volume/six_1605205320872/work +sniffio @ file:///opt/concourse/worker/volumes/live/1faeb672-6d3f-4f1f-7861-294fcf282962/volume/sniffio_1614030462215/work +snowballstemmer @ file:///tmp/build/80754af9/snowballstemmer_1611258885636/work +sortedcollections @ file:///tmp/build/80754af9/sortedcollections_1611172717284/work +sortedcontainers @ file:///tmp/build/80754af9/sortedcontainers_1606865132123/work +soupsieve @ file:///tmp/build/80754af9/soupsieve_1616183228191/work +Sphinx @ file:///tmp/build/80754af9/sphinx_1620777493457/work +sphinxcontrib-applehelp @ file:///home/ktietz/src/ci/sphinxcontrib-applehelp_1611920841464/work +sphinxcontrib-devhelp @ file:///home/ktietz/src/ci/sphinxcontrib-devhelp_1611920923094/work +sphinxcontrib-htmlhelp @ file:///home/ktietz/src/ci/sphinxcontrib-htmlhelp_1611920974801/work +sphinxcontrib-jsmath @ file:///home/ktietz/src/ci/sphinxcontrib-jsmath_1611920942228/work +sphinxcontrib-qthelp @ file:///home/ktietz/src/ci/sphinxcontrib-qthelp_1611921055322/work +sphinxcontrib-serializinghtml @ file:///home/ktietz/src/ci/sphinxcontrib-serializinghtml_1611920755253/work +sphinxcontrib-websupport @ file:///tmp/build/80754af9/sphinxcontrib-websupport_1597081412696/work +spyder @ file:///opt/concourse/worker/volumes/live/b2db1f66-1cfc-4529-6e21-61097ab49952/volume/spyder_1616775698806/work +spyder-kernels @ file:///opt/concourse/worker/volumes/live/12a19b85-7733-4e39-55a2-982abb6f0274/volume/spyder-kernels_1614030593315/work +SQLAlchemy @ file:///opt/concourse/worker/volumes/live/e51bfc47-a66d-4e2e-59af-339342299d8c/volume/sqlalchemy_1620712425990/work +statsmodels @ file:///opt/concourse/worker/volumes/live/f55add71-53cb-4510-4c08-7ae9e294a482/volume/statsmodels_1614023740731/work +sympy @ file:///opt/concourse/worker/volumes/live/0bf7d347-8399-4af8-5e6f-4267d4aabdd9/volume/sympy_1618255318469/work +tables==3.6.1 +tblib @ 
file:///tmp/build/80754af9/tblib_1597928476713/work +terminado==0.9.4 +testpath @ file:///home/ktietz/src/ci/testpath_1611930608132/work +textdistance @ file:///tmp/build/80754af9/textdistance_1612461398012/work +threadpoolctl @ file:///tmp/tmp9twdgx9k/threadpoolctl-2.1.0-py3-none-any.whl +three-merge @ file:///tmp/build/80754af9/three-merge_1607553261110/work +tifffile==2020.10.1 +toml @ file:///tmp/build/80754af9/toml_1616166611790/work +toolz @ file:///home/linux1/recipes/ci/toolz_1610987900194/work +tornado @ file:///opt/concourse/worker/volumes/live/05341796-4198-4ded-4a9a-332fde3cdfd1/volume/tornado_1606942323372/work +tqdm @ file:///tmp/build/80754af9/tqdm_1615925068909/work +traitlets @ file:///home/ktietz/src/ci/traitlets_1611929699868/work +typed-ast @ file:///opt/concourse/worker/volumes/live/b635b0af-0037-435d-5a8b-38d096f36bbf/volume/typed-ast_1610484559102/work +typing-extensions @ file:///home/ktietz/src/ci_mi/typing_extensions_1612808209620/work +ujson @ file:///opt/concourse/worker/volumes/live/b4182deb-c6ce-4bc8-7783-61027c162049/volume/ujson_1611259506235/work +unicodecsv==0.14.1 +urllib3 @ file:///tmp/build/80754af9/urllib3_1615837158687/work +watchdog @ file:///opt/concourse/worker/volumes/live/ba071b65-d6ec-4539-5875-0791be503584/volume/watchdog_1612471127391/work +wcwidth @ file:///tmp/build/80754af9/wcwidth_1593447189090/work +webencodings==0.5.1 +Werkzeug @ file:///home/ktietz/src/ci/werkzeug_1611932622770/work +widgetsnbextension==3.5.1 +wrapt @ file:///opt/concourse/worker/volumes/live/e3646d84-e961-4523-6bed-01532273c57e/volume/wrapt_1597851473852/work +wurlitzer @ file:///opt/concourse/worker/volumes/live/bf42cded-f988-433c-5eca-a88c3905d29b/volume/wurlitzer_1617224650715/work +xlrd @ file:///tmp/build/80754af9/xlrd_1608072521494/work +XlsxWriter @ file:///tmp/build/80754af9/xlsxwriter_1617224712951/work +xlwings==0.23.0 +xlwt==1.3.0 +xmltodict==0.12.0 +yapf @ file:///tmp/build/80754af9/yapf_1615749224965/work +zict==2.0.0 +zipp @ 
# -*- coding: utf-8 -*-
"""Measure how stripping configured stop words affects NLU recognition.

For every stop word the script keeps the corpus queries that contain it,
removes the stop word, sends the shortened query to the test NLU endpoint,
writes the recognised rows to CSV reports and finally plots the rates.
"""

import csv
import json
import os
import time  # noqa: F401 -- unused; kept from the original import list

import numpy as np
import pandas as pd


class StopWordTest:
    """Driver for the stop-word experiment (load, strip, query, score, plot)."""

    # Column order shared by the evaluation records and the CSV reports.
    _RESULT_FIELDS = (
        "domain",
        "intent",
        "initial_query",
        "query",
        "url_domain",
        "url_intent",
    )

    def __init__(self):
        """Load the corpus and the stop-word list and set request constants."""
        # Project-local readers are imported here so that importing this
        # module does not require the config/ and data/ packages.
        from config import readConfig as rc
        from data import readData as rd

        print("#" * 30, end="")
        print("Loading the module of stopWordTest ....", end="")
        print("#" * 30)

        # Directory that receives the CSV reports.
        self.dirPath = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "result"
        )
        # Endpoint of the NLU test service.
        self.url = "https://testnlu.gree.com/regnlu/query"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }

        reader = rd.ReadData()
        # Raw CSV rows, header first.
        self.initial_list = reader.readData()
        # Corpus de-duplicated on "query".
        self.initial_df = reader.getData(self.initial_list)
        # Stop words to evaluate, one run per word.
        self.stopWordList = rc.ReadConfig().readConfig()
        # One rate per processed stop word, filled by main().
        self.correctRateLst = []
        self.errorRateLst = []

    @staticmethod
    def _is_missing(value):
        """Return True for None, NaN or an empty string."""
        if value is None:
            return True
        if isinstance(value, float):
            return value != value  # NaN is the only float unequal to itself
        return value == ""

    def dealStopWord(self, stopWord):
        """Return the corpus rows containing ``stopWord`` with it removed.

        Args:
            stopWord: substring to strip from each matching query.

        Returns:
            DataFrame with columns ``domain``, ``intent``, ``initial_query``
            (the untouched query) and ``query`` (stop word removed). Rows
            whose query becomes empty are dropped.
        """
        print("==========================进入停用词处理============================")
        print("正在处理停用词:", stopWord)

        dealStopWordList = []
        for _, row in self.initial_df.iterrows():
            try:
                initial_query = row["query"]
                if stopWord not in initial_query:
                    continue
                query = initial_query.replace(stopWord, "")
                # A query that consisted only of the stop word has nothing
                # left to send to the service.
                if query == "":
                    continue
                dealStopWordList.append(
                    {
                        "domain": row["domain"],
                        "intent": row["intent"],
                        "initial_query": initial_query,
                        "query": query,
                    }
                )
            except (KeyError, TypeError, AttributeError) as e:
                # Missing column or non-string query: report and skip the row.
                print("The error of dealing stopWord: ", e)

        df = pd.DataFrame(
            dealStopWordList,
            columns=["domain", "intent", "initial_query", "query"],
        )
        print("停用词处理结束 ...")
        print("The dimension of df after dealing stopWord: ", end="")
        print(df.shape)
        print("==========================退出停用词处理============================")
        return df

    def dataTest(self, df):
        """Send every ``query`` in ``df`` to ``self.url`` and record the answer.

        Args:
            df: DataFrame with a ``query`` column.

        Returns:
            A copy of ``df`` with ``url_domain`` / ``url_intent`` columns taken
            from the response's ``semantic.service`` / ``semantic.action``.
            Rows whose request or response parsing failed keep ``None``.
        """
        # Third-party imports are local so the module imports without them.
        import requests
        from tqdm import tqdm

        print("=========================批量发起网络请求===========================")
        print("Requesting url ...")

        df = df.copy()
        # Create the result columns up front so they exist even when the
        # frame is empty or every request fails.
        for column in ("url_domain", "url_intent"):
            if column not in df.columns:
                df[column] = None
        domain_col = df.columns.get_loc("url_domain")
        intent_col = df.columns.get_loc("url_intent")

        try:
            for k in tqdm(range(len(df))):
                query = df["query"].iloc[k]
                # NOTE(review): clientId / agentId / token are hard-coded
                # credentials; consider moving them to configuration.
                para = {"clientId":"test_zww","sessionId":"test_zww","agentId":"YTYxMjRmZDY1NGM1","token":"bb2c960a3a8e018fab0a7b1bccfab91d3855","query":query }
                para = json.dumps(para)
                try:
                    r = requests.post(
                        url=self.url, headers=self.headers, data=para, timeout=60
                    )
                except requests.RequestException as e:
                    print("The error of requesting url: ", e)
                    # Skip the row: falling through would reuse the previous
                    # iteration's response for this query.
                    continue

                try:
                    semantic = json.loads(r.text).get("semantic")
                    service = semantic.get("service")
                    action = semantic.get("action")
                except (ValueError, AttributeError) as e:
                    # Body is not JSON, or it has no "semantic" object.
                    print("The error of operating dataframe: ", e)
                    continue

                # Positional writes, matching the positional read above.
                df.iloc[k, domain_col] = service
                df.iloc[k, intent_col] = action
        finally:
            print("The process of requesting url finished!")
            print("=========================结束所有网络请求===========================")

        return df

    def dataEvaluation(self, stopWord, df, countCorrect, countError):
        """Split the tested rows into "correct" and "error" and compute rates.

        A row counts only when the service returned a non-empty
        ``url_domain``. It is "correct" when its labelled ``domain`` is
        ``"chat"`` and "error" otherwise.

        Args:
            stopWord: stop word under test (not used in the computation).
            df: frame returned by dataTest().
            countCorrect: starting value of the correct counter.
            countError: starting value of the error counter.

        Returns:
            ``(correctRate, errorRate, rowCorrectDictLst, rowErrorDictLst)``.
            Both rates are ``0.0`` for an empty frame.
        """
        rowCorrectDictLst = []
        rowErrorDictLst = []

        for _, row in df.iterrows():
            # Rows without a recognised domain (request failed, or the
            # service answered with nothing) are in neither bucket.
            if self._is_missing(row.get("url_domain")):
                continue
            try:
                record = {field: row[field] for field in self._RESULT_FIELDS}
            except KeyError as e:
                print("The error of counting corpus: ", e)
                continue
            if record["domain"] == "chat":
                rowCorrectDictLst.append(record)
                countCorrect += 1
            else:
                rowErrorDictLst.append(record)
                countError += 1

        total = df.shape[0]

        print("=====================计算正确识别语料数量和比率=====================")
        # An empty frame (no query contained the stop word) must not crash.
        correctRate = countCorrect / total if total else 0.0
        print("正确识别语料数量:", str(countCorrect))
        print("正确识别语料比率:", str(correctRate))
        print("=============================计算结束===============================")

        print("=====================计算错误识别语料数量和比率=====================")
        errorRate = countError / total if total else 0.0
        print("错误识别语料数量:", str(countError))
        print("错误识别语料比率:", str(errorRate))
        print("=============================计算结束===============================")

        return correctRate, errorRate, rowCorrectDictLst, rowErrorDictLst

    def WriteData(self, filePath, lst):
        """Write the records in ``lst`` to ``filePath`` as a CSV with header.

        Args:
            filePath: destination file; it is overwritten.
            lst: dicts keyed by the six result fields.
        """
        print("Writing file:", filePath)
        # newline="" lets the csv module control line endings (otherwise
        # Windows output gets blank lines between rows).
        with open(filePath, "w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=self._RESULT_FIELDS)
            writer.writeheader()
            writer.writerows(lst)

    def drawPicture(self, correctRateLst, errorRateLst):
        """Show a grouped bar chart of the correct/error rate per stop word.

        Args:
            correctRateLst: correct rates, one per stop word.
            errorRateLst: error rates, in the same order.
        """
        # Plotting imports are local so the module imports without matplotlib.
        import matplotlib as mpl
        from matplotlib import pyplot as plt

        print("=============================正在绘图===============================")

        # Font able to render the Chinese labels.
        mpl.rcParams["font.sans-serif"] = ["Songti SC"]
        mpl.rcParams["axes.unicode_minus"] = False

        # One bar group per stop word rather than a fixed count of three.
        x = np.arange(len(correctRateLst))
        barWidth = 0.35
        tickLabelLst = [
            "corpus{}".format(i) for i in range(1, len(correctRateLst) + 1)
        ]

        plt.bar(x, correctRateLst, barWidth, align="center", color="c", label="正确识别语料比率", alpha=0.5)
        plt.bar(x + barWidth, errorRateLst, barWidth, color="b", align="center", label="错误识别语料比率", alpha=0.5)

        plt.xlabel("相关语料")
        plt.ylabel("比率")

        # Centre each tick between the two bars of its group.
        plt.xticks(x + barWidth / 2, tickLabelLst)
        plt.legend()
        plt.show()

        print("=============================退出绘图===============================")

    def main(self):
        """Run the experiment for every stop word, write reports and plot."""
        print("==========================判断文件存在性============================")
        if os.path.exists(self.dirPath):
            print(self.dirPath + ":目录已存在")
        else:
            # exist_ok guards against the directory appearing in between.
            os.makedirs(self.dirPath, exist_ok=True)
            print(self.dirPath + ":目录创建成功")
        print("=============================判断结束===============================")

        for stopWord in self.stopWordList:
            df = self.dealStopWord(stopWord=stopWord)
            test_df = self.dataTest(df=df)
            correctRate, errorRate, correctDictLst, errorDictLst = self.dataEvaluation(
                stopWord=stopWord, df=test_df, countCorrect=0, countError=0
            )
            correctFilePath = os.path.join(self.dirPath, stopWord + "_correct.csv")
            errorFilePath = os.path.join(self.dirPath, stopWord + "_error.csv")

            print("=============================写入文件===============================")
            self.WriteData(correctFilePath, correctDictLst)
            self.WriteData(errorFilePath, errorDictLst)
            print("=============================写入完成===============================")

            self.correctRateLst.append(correctRate)
            self.errorRateLst.append(errorRate)

        self.drawPicture(self.correctRateLst, self.errorRateLst)


if __name__ == '__main__':
    stopWordTest = StopWordTest()
    stopWordTest.main()
    print("#" * 30, end="")
    print("The process of stopWordTest finished!", end="")
    print("#" * 30)