# -*- coding: utf-8 -*-
# author:Li Mingjie time:2019/1/24
# Brief:process unisound logfile
import json
import re
import pandas as pd
import threading
import bottom_function.m_SQL as qb
import datetime as dt
import paramiko
import os
import time
import bottom_function.data_read as dr

# Shell command that prepares the log file before it is read.
_SORT_LOG_SCRIPT = "sh /home/work/semantic_platform_DAS/bottom_function/data/sortLog.sh"
# Log file consumed by ``timing_processing.data_storage``.
_SEMANTIC_LOG_PATH = '/home/work/semantic_platform_DAS/bottom_function/data/semantic/order/semantic.txt'
# Hour-resolution stamp format (the trailing space is part of the format).
_HOUR_FORMAT = "%Y-%m-%d %H "
# Top-level key of the JSON document built from each log line.
_LOG_KEY = 'ctoc query log'
# Column order of the per-query table written to MySQL.
_RECORD_COLUMNS = ('datetime', 'mac_wifi', 'mac_voice', 'query', 'classify',
                   'code', 'domain', 'intent', 'response_data', 'cost_time_ms')
# (column label, inclusive lower bound in ms, exclusive upper bound in ms);
# an upper bound of None means "no upper limit".
_COST_TIME_BUCKETS = (
    ("0~500ms", 0, 500),
    ("500~1000ms", 500, 1000),
    ("1000~2000ms", 1000, 2000),
    ("2000~3000ms", 2000, 3000),
    ("3000~5000ms", 3000, 5000),
    ("morethan5000ms", 5000, None),
)


def _normalize_log_line(line):
    """Rewrite one raw log line into the text of a JSON document.

    The line is lower-cased and its ``[ctoc query log]``, ``request:``,
    ``response:`` and ``costtime:`` markers are replaced by JSON punctuation.
    The replacements are order-sensitive and are applied exactly as listed.
    """
    text = str(line).lower()
    text = text.replace('[ctoc query log]', '{"ctoc query log":{"time":"')
    text = text + '"}}'
    text = text.replace('request:', '","request_m":"')
    text = text.replace('{"reqparam":', '","request":{"reqparam":')
    text = text.replace("'", "")
    text = text.replace('response:', '"response":').replace('costtime:', ',"costtime":"')
    return text.replace('\t', '')


def _cost_to_ms(raw_cost):
    """Convert a cost-time value such as ``'12ms'`` or ``'1.5s'`` to milliseconds."""
    cost = str(raw_cost).replace('ms', '')
    if 's' in cost:
        # A remaining 's' means the value was expressed in seconds.
        return float(cost.split('s', 1)[0]) * 1000
    return float(cost)


def _extract_record(js_data):
    """Pull the stored fields out of one parsed log entry.

    Returns a dict keyed by ``_RECORD_COLUMNS``. Any missing key or malformed
    value raises; the caller treats that as "line cannot be processed".
    """
    entry = js_data[_LOG_KEY]
    dom = inte = resp = 'null'
    code = 0
    macw = macv = 'null'

    stamp = entry['time']
    query = entry['request']['reqparam']['nluret']['asr_recongize'].replace(',', '')

    response = entry['response']
    # A response carrying a 'status' member is classified as a control
    # request; everything else is an application request.
    cla = 'control' if 'status' in response else 'application'

    if cla == 'application':
        semantic = response['header']['semantic']
        dom = semantic['domain']
        inte = semantic['intent']
        if response['response_text'] is not None:
            resp = str(response['response_text']).replace('\n', '').replace(' ', '')
            code = semantic['code']
        else:
            # No response text: report the error type in place of the
            # domain, intent and response.
            code = response['code']
            dom = inte = resp = response['errortype']
    else:
        status = response['status']
        code = status['code']
        resp = status['errortype']
        if code == 0:
            dom = response['semantic']['service']
            inte = response['semantic']['action']
        else:
            dom = inte = resp

    m_cost = _cost_to_ms(entry['costtime'])

    traffic = entry['request']['reqparam']['common']['trafficparameter']
    for m_par in traffic.split(';'):
        if 'macwifi' in m_par:
            macw = m_par.replace('macwifi=', '')
        elif 'macvoice' in m_par:
            macv = m_par.replace('macvoice=', '')

    return {'datetime': stamp, 'mac_wifi': macw, 'mac_voice': macv,
            'query': query, 'classify': cla, 'code': code, 'domain': dom,
            'intent': inte, 'response_data': resp, 'cost_time_ms': m_cost}


class timing_processing:
    """Hourly job: load the semantic log into MySQL and write summary tables."""

    # Domains counted for control requests (initial value of ``gree_list``).
    _CONTROL_DOMAINS = (
        "aircleaner", "airconditioner", "airconditionerfan", "airsteward",
        "curtain", "dehumidifier", "disinfection", "fanner", "furnace",
        "humidifier", "playcontrol", "refrigerator", "ricecooker",
        "smokelampblackmachine", "universalcontrol", "ventilation",
        "washingmachine", "waterheater",
    )
    # Domains counted for application requests (initial value of ``tencent_list``).
    _APPLICATION_DOMAINS = (
        "almanac", "ancient_poem", "astro", "baike", "chat", "chengyu",
        "common_qa", "finance", "fm", "food", "general_question_answering",
        "history", "holiday", "joke", "music", "news", "recipe", "science",
        "sound", "sports", "stock", "translate", "weather",
    )

    def __init__(self):
        # Parsed records of the most recent data_storage() call.
        self.data = pd.DataFrame()
        # Hour stamp of the most recent run; data_storage() replaces it with
        # a string formatted with _HOUR_FORMAT.
        self.datetime = pd.Timestamp("2019-01-01 00:00:00")
        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to configuration or the environment.
        self.db = qb.Schema(host="localhost", user="560193", password="jay560193",
                            mysqlName="semantic_data_schema", port="3306")
        self.gree_list = list(self._CONTROL_DOMAINS)
        self.tencent_list = list(self._APPLICATION_DOMAINS)

    def data_storage(self):
        """Parse the semantic log and store the records in MySQL.

        Runs the sort script, reads the log file, converts every line into
        a record and writes three tables: all records, control errors and
        application errors (``code != 0``). Lines that cannot be parsed are
        skipped and their count is printed. Sets ``self.data`` (records
        indexed by datetime) and ``self.datetime`` (current hour stamp).
        """
        os.system(_SORT_LOG_SCRIPT)
        with open(_SEMANTIC_LOG_PATH, 'r', encoding='utf-8') as log_file:
            data = log_file.readlines()
        if len(data) == 0:
            print("data is null")

        records = []
        skipped = 0
        for line_data in data:
            line_data = line_data.strip('\n')
            # One try block per line: a line whose JSON cannot be decoded
            # never reaches extraction, so it cannot pick up the fields of
            # the previously parsed line.
            try:
                js_data = json.loads(_normalize_log_line(line_data))
                records.append(_extract_record(js_data))
            except Exception:
                skipped += 1
        if skipped:
            print("%d of %d log lines could not be parsed and were skipped"
                  % (skipped, len(data)))

        outdata = pd.DataFrame(
            {column: [record[column] for record in records]
             for column in _RECORD_COLUMNS})
        outdata['datetime'] = pd.to_datetime(outdata['datetime'])
        outdata = outdata.sort_values(by=['datetime'])
        self.datetime = time.strftime(_HOUR_FORMAT, time.localtime(time.time()))
        outdata = outdata.set_index('datetime')
        self.data = outdata

        failed = outdata['code'] != 0
        control_error_data = outdata[(outdata['classify'] == 'control') & failed]
        application_error_data = outdata[(outdata['classify'] == 'application') & failed]
        self.db.dataframeToMysql(data=outdata, tableName="semantic_data_table")
        self.db.dataframeToMysql(data=control_error_data, tableName="control_error_data")
        self.db.dataframeToMysql(data=application_error_data,
                                 tableName="application_error_data")
        print('storage the data to SQL is complete')

    def domain_data_to_statistics(self, data, data_type):
        """Count records per domain and write one summary row to MySQL.

        :param data: DataFrame with a ``domain`` column.
        :param data_type: ``'control'`` or ``'application'``; selects the
            known domains and the target table.
        :return: ``-1`` when ``data_type`` is not recognised, else ``None``.

        ``self.gree_list`` / ``self.tencent_list`` is replaced by the result
        of ``self.db.GetField`` for the target table. Every counted domain
        missing from that list is appended to it and passed to
        ``self.db.setAddField``.
        """
        print('Start domain data classification:')
        if data_type == 'control':
            known_domains = self._CONTROL_DOMAINS
            table_name = "control_domain_data"
            self.gree_list = self.db.GetField(tableName=table_name)
            domain_list = self.gree_list
        elif data_type == 'application':
            known_domains = self._APPLICATION_DOMAINS
            table_name = "application_domain_data"
            self.tencent_list = self.db.GetField(tableName=table_name)
            domain_list = self.tencent_list
        else:
            print("data_type is error,you must chose control or application.")
            return -1

        # Every known domain starts at zero so the row always has its column.
        data_dict_domain = {"datetime": self.datetime}
        data_dict_domain.update(dict.fromkeys(known_domains, 0))

        for domain_data in data['domain']:
            data_dict_domain[domain_data] = data_dict_domain.get(domain_data, 0) + 1
            if domain_data not in domain_list:
                # domain_list is the same object as self.gree_list or
                # self.tencent_list, so this also updates the attribute.
                domain_list.append(domain_data)
                self.db.setAddField(tableName=table_name, field=domain_data)

        data_dict_domain['datetime'] = self.datetime
        aldtaframe = pd.DataFrame(data_dict_domain, index=[0])
        aldtaframe['datetime'] = pd.to_datetime(aldtaframe['datetime'])
        aldtaframe = aldtaframe.set_index('datetime')
        self.db.dataframeToMysql(data=aldtaframe, tableName=table_name)
        print("Complete write")

    @staticmethod
    def _top_domains(frame, domains, low, high, limit=3):
        """Return up to ``limit`` ``(domain, count)`` pairs, largest count first.

        Only rows with ``low <= cost_time_ms < high`` are counted; ``high``
        of None removes the upper bound. Ties keep the order of ``domains``.
        """
        in_bucket = frame['cost_time_ms'] >= low
        if high is not None:
            in_bucket &= frame['cost_time_ms'] < high
        bucket_domains = frame.loc[in_bucket, 'domain']
        # int() keeps the stored text independent of NumPy's scalar repr.
        counts = {domain: int((bucket_domains == domain).sum()) for domain in domains}
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        return ranked[:limit]

    def costtime_data_to_statistics(self, data):
        """Write the busiest domains of each cost-time bucket to MySQL.

        :param data: DataFrame with ``classify``, ``domain`` and
            ``cost_time_ms`` columns.

        Each bucket column holds the text of a list containing the top
        three control domains followed by the top three application domains.
        """
        print('Start cost time data statistics:')
        tencent_data = data[data['classify'] == 'application']
        gree_data = data[data['classify'] == 'control']

        all_data_dict = {"datetime": self.datetime}
        for label, low, high in _COST_TIME_BUCKETS:
            gree_top = self._top_domains(gree_data, self.gree_list, low, high)
            tencent_top = self._top_domains(tencent_data, self.tencent_list, low, high)
            all_data_dict[label] = str(gree_top + tencent_top)

        aldtaframe = pd.DataFrame([all_data_dict])
        aldtaframe['datetime'] = pd.to_datetime(aldtaframe['datetime'])
        aldtaframe = aldtaframe.set_index('datetime')
        self.db.dataframeToMysql(data=aldtaframe, tableName="cost_time_data")
        print("Complete write")

    def run(self):
        """Store the current log, then write the summaries for the last hour."""
        self.data_storage()
        # Both window bounds come from the hour stamp recorded by
        # data_storage(), so the window is exactly one hour even when the
        # wall clock crosses an hour boundary during the run.
        nw_timestamp = pd.to_datetime(self.datetime)
        pr_datetime = str(nw_timestamp - pd.Timedelta(hours=1))
        nw_datetime = str(nw_timestamp)
        print("时间为:### %s" % pr_datetime)
        print("时间为:### %s" % nw_datetime)
        cdata = dr.read_data(datatype='control', starttime=pr_datetime, endtime=nw_datetime)
        adata = dr.read_data(datatype='application', starttime=pr_datetime,
                             endtime=nw_datetime)
        print("all****%d\n" % (len(cdata)))
        print("all****%d\n" % (len(adata)))
        # Only successful requests (code == 0) enter the domain statistics.
        controldata = cdata[cdata['code'] == 0]
        applicationdata = adata[adata['code'] == 0]
        print("****%d\n" % (len(controldata)))
        print("****%d\n" % (len(applicationdata)))
        self.domain_data_to_statistics(data=controldata, data_type="control")
        self.domain_data_to_statistics(data=applicationdata, data_type="application")
        self.costtime_data_to_statistics(data=self.data)


def load_run():
    """Print the current hour stamp and run one processing pass."""
    print("*****datetime:")
    print(time.strftime(_HOUR_FORMAT, time.localtime(time.time())))
    print("*****\n")
    TP = timing_processing()
    TP.run()


# The job runs as soon as this file is executed or imported.
load_run()

# Disabled historical backfill, kept for reference:
# st = pd.Timestamp("2018-12-01 00:00:00")
# et = pd.Timestamp("2019-01-01 00:00:00")
# u = (et - st).days * 24
# for i in range(u):
#     print("提取第 %d 小时" % i)
#     TP = timing_processing()
#     start_time = st + dt.timedelta(hours=i)
#     end_time = start_time + dt.timedelta(hours=1)
#     TP.datetime = end_time
#     TP.data = TP.db.getData(tableName='semantic_data_table', startTime=str(start_time),
#                             endTime=str(end_time))
#     data = TP.data
#     controldata = data[(data['classify'] == 'gree') & (data['code'] == 0)]
#     applicationdata = data[(data['classify'] == 'tencent') & (data['code'] == 0)]
#     TP.domain_data_to_statistics(data=controldata, data_type="control")
#     TP.domain_data_to_statistics(data=applicationdata, data_type="application")
#     TP.costtime_data_to_statistics(data=data)
# TP = timing_processing()
# TP.data_storage()