普通版降重
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

63 lines
1.8 KiB

# -*- coding: utf-8 -*-
"""
@Time : 2023/2/15 14:13
@Author :
@FileName:
@Software:
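@Describe: For every Excel file in path_1, join its rows with the matching rows from path_2 / path_3 and with the t5 / yy lookup tables, then write one summary workbook per file.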
"""
import os
import pandas as pd
# Directory whose Excel files are listed and processed one by one.
path_1 = '../data/11篇excel'
# Directory holding a same-named Excel file for each file in path_1.
path_2 = "../data/11篇临时拼接"
# Directory holding a "<stem>_.<ext>" Excel file for each file in path_1.
path_3 = "../data/11篇临时拼接2"
# Two-column workbooks used as lookup tables (source text -> rewritten text).
path_yy = "../data/11篇_yy_strsim.xlsx"
path_t5 = "../data/11篇_t5_strsim.xlsx"


def _build_lookup(rows):
    """Map each row's first cell to its second cell, both as stripped text.

    Args:
        rows: Iterable of row lists; only cells 0 and 1 are read.

    Returns:
        A dict keyed by ``str(row[0]).strip()`` with ``str(row[1]).strip()``
        as the value. If a key occurs more than once, the last row wins.

    Raises:
        IndexError: If a row has fewer than two cells.
    """
    # NOTE(review): the previous code also called .strip("") before .strip().
    # An empty character set strips nothing, so it was dropped; if a
    # non-printing character was meant to be stripped there, restore it.
    return {str(row[0]).strip(): str(row[1]).strip() for row in rows}


data_yy = pd.read_excel(path_yy).values.tolist()
data_t5 = pd.read_excel(path_t5).values.tolist()
data_yy_dict = _build_lookup(data_yy)
data_t5_dict = _build_lookup(data_t5)
def _merge_rows(rows_1, rows_2, rows_3, t5_lookup, yy_lookup):
    """Join each usable row of ``rows_1`` with its counterparts and lookups.

    For row ``idx`` of ``rows_1`` the result row is the original row followed
    by ``rows_2[idx][1]``, ``rows_3[idx][1]`` and the t5 / yy lookup values
    for the stripped text of the row's first cell.

    Args:
        rows_1: Rows of the primary sheet; cell 0 is the lookup key.
        rows_2: Rows of the second sheet, aligned by position with rows_1.
        rows_3: Rows of the third sheet, aligned by position with rows_1.
        t5_lookup: Mapping from source text to its t5 value.
        yy_lookup: Mapping from source text to its yy value.

    Returns:
        List of merged rows. Rows whose first cell is empty are skipped
        silently; rows with a missing lookup key or a missing counterpart
        row are skipped after printing their source text.
    """
    merged = []
    for idx, row in enumerate(rows_1):
        source_cell = row[0]
        # pandas yields NaN (not "") for blank Excel cells, so test for both;
        # otherwise blank rows would be looked up under the text "nan".
        if pd.isna(source_cell) or source_cell == "":
            continue
        source_text = str(source_cell).strip()
        try:
            t5_value = t5_lookup[source_text]
            yy_value = yy_lookup[source_text]
            merged.append(row + [rows_2[idx][1], rows_3[idx][1], t5_value, yy_value])
        except (KeyError, IndexError):
            # Best effort: report the row that could not be completed and
            # keep going, without hiding unrelated errors or Ctrl-C.
            print(source_text)
    return merged


path_list = os.listdir(path_1)
for file_name in path_list:
    # splitext keeps names with several dots intact ("a.b.xlsx" -> "a.b").
    file_name_0, file_ext = os.path.splitext(file_name)
    # The third directory stores the same file with "_" appended to the stem.
    file_name_ = file_name_0 + "_" + file_ext

    data_1 = pd.read_excel(os.path.join(path_1, file_name)).values.tolist()
    data_2 = pd.read_excel(os.path.join(path_2, file_name)).values.tolist()
    data_3 = pd.read_excel(os.path.join(path_3, file_name_)).values.tolist()

    data_new = _merge_rows(data_1, data_2, data_3, data_t5_dict, data_yy_dict)

    # Six headers for row + 4 appended cells, so each row of the primary
    # sheet is expected to carry exactly two cells — TODO confirm.
    df = pd.DataFrame(
        data_new,
        columns=["原文", "simbert", "simbert_datasim07", "bertsim_simsim", "t5", "yy"],
    )
    df.to_excel("../data/11篇测试excel_汇总_3/{}.xlsx".format(file_name_0), index=False)