You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
63 lines
1.8 KiB
63 lines
1.8 KiB
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
@Time : 2023/2/15 14:13
|
|
@Author :
|
|
@FileName:
|
|
@Software:
|
|
@Describe:
|
|
"""
|
|
import os
|
|
import pandas as pd
|
|
# Folder whose entries are listed and read as Excel workbooks; its file names
# drive the whole merge.
path_1 = "../data/11篇excel"
# Folder expected to hold a workbook with the same file name as each path_1 entry.
path_2 = "../data/11篇临时拼接"
# Folder expected to hold a "<stem>_.<ext>" counterpart of each path_1 entry.
path_3 = "../data/11篇临时拼接2"
# Workbooks whose first two columns are turned into text -> text lookup tables.
path_yy = "../data/11篇_yy_strsim.xlsx"
path_t5 = "../data/11篇_t5_strsim.xlsx"
|
|
|
|
|
|
def _normalize_cell(cell):
    """Return *cell* as text with edge "。" removed first, then edge whitespace."""
    return str(cell).strip("。").strip()


# Rows of each lookup workbook as plain Python lists.
data_yy = pd.read_excel(path_yy).values.tolist()
data_t5 = pd.read_excel(path_t5).values.tolist()

# Map normalised column-0 text to normalised column-1 text.
# When the same key occurs in several rows, the last row wins.
data_yy_dict = {_normalize_cell(row[0]): _normalize_cell(row[1]) for row in data_yy}
data_t5_dict = {_normalize_cell(row[0]): _normalize_cell(row[1]) for row in data_t5}
|
|
|
|
|
|
|
|
# Names of every entry in the input folder, in the order os.listdir reports them.
path_list = list(os.listdir(path_1))
|
|
|
|
|
|
# For every workbook listed in path_list, extend each of its rows with:
#   * column 1 of the same-index row of the same-named workbook in path_2,
#   * column 1 of the same-index row of the "<stem>_<ext>" workbook in path_3,
#   * the t5 and yy lookup values for the row's column-0 text,
# then write the merged rows to one output workbook per input file.
for file_name in path_list:
    # splitext cuts at the LAST dot only, so a name containing extra dots keeps
    # its full stem (split(".")[0] / [1] would truncate it and pick the wrong
    # companion and output file names).
    file_name_0, file_ext = os.path.splitext(file_name)
    file_name_ = file_name_0 + "_" + file_ext

    data_1 = pd.read_excel(os.path.join(path_1, file_name)).values.tolist()
    data_2 = pd.read_excel(os.path.join(path_2, file_name)).values.tolist()
    data_3 = pd.read_excel(os.path.join(path_3, file_name_)).values.tolist()

    data_new = []
    for i, row in enumerate(data_1):
        # Rows whose first cell is only a full stop carry no sentence; skip them.
        if row[0] == "。":
            continue

        # NOTE(review): the lookup tables' keys have edge "。" stripped as well,
        # while this key is only whitespace-stripped, so text that still ends
        # with "。" can never match — confirm this is intended.
        str_data = str(row[0]).strip()
        try:
            data_t5_dan = data_t5_dict[str_data]
            data_yy_dan = data_yy_dict[str_data]
            merged_row = row + [data_2[i][1], data_3[i][1], data_t5_dan, data_yy_dan]
        except (KeyError, IndexError):
            # KeyError: text missing from a lookup table.
            # IndexError: a companion workbook has fewer rows or columns.
            # The row is reported and left out of the output (best effort);
            # anything else, e.g. Ctrl-C, is no longer swallowed.
            print(str_data)
            continue
        data_new.append(merged_row)

    # Six column names are given, so every path_1 row must contribute exactly
    # two cells for the frame to be constructible.
    df = pd.DataFrame(
        data_new,
        columns=["原文", "simbert", "simbert_datasim07", "bertsim_simsim", "t5", "yy"],
    )
    df.to_excel("../data/11篇测试excel_汇总_3/{}.xlsx".format(file_name_0), index=False)
|
|
|
|
|