You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
33 lines
860 B
33 lines
860 B
![]()
2 years ago
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
"""
|
||
|
@Time : 2023/2/27 18:24
|
||
|
@Author :
|
||
|
@FileName:
|
||
|
@Software:
|
||
|
@Describe:
|
||
|
"""
|
||
|
import pandas as pd
|
||
|
import difflib
|
||
|
|
||
|
# Input: one tab-separated pair of strings per line.
file = "../data/11篇汇总txt_new_predict_t5.txt"
# Output: spreadsheet of the pairs with their similarity score.
OUTPUT_FILE = "../data/11篇_t5_strsim.xlsx"


def read_nonempty_lines(path, encodings=("utf-8", "gbk")):
    """Return the stripped, non-blank lines of the text file at *path*.

    Each encoding in *encodings* is tried in order; the next one is used
    only when decoding fails. Any other error (for example a missing file)
    propagates immediately instead of being masked by a retry.

    Raises:
        UnicodeDecodeError: if the file cannot be decoded with any encoding.
        OSError: if the file cannot be opened.
    """
    *fallbacks, last = encodings
    for encoding in fallbacks:
        try:
            return _read_stripped(path, encoding)
        except UnicodeDecodeError:
            # Wrong guess for this file; try the next candidate encoding.
            continue
    return _read_stripped(path, last)


def _read_stripped(path, encoding):
    """Read *path* with *encoding* and drop lines that are blank after strip."""
    with open(path, "r", encoding=encoding) as f:
        stripped = (raw.strip() for raw in f)
        return [text for text in stripped if text != ""]


def score_pairs(lines):
    """Build ``[left, right, similarity]`` rows from tab-separated lines.

    Lines that do not split into exactly two tab-separated fields are
    skipped. The similarity is ``SequenceMatcher.quick_ratio()``, which is a
    fast upper bound on ``ratio()`` rather than the exact ratio.
    """
    rows = []
    for line in lines:
        fields = line.split("\t")
        if len(fields) != 2:
            continue
        left, right = fields
        similarity = difflib.SequenceMatcher(None, left, right).quick_ratio()
        rows.append([left, right, similarity])
    return rows


def main():
    """Score every pair in ``file`` and write them, best match first, to Excel."""
    data_new = score_pairs(read_nonempty_lines(file))
    print(data_new)
    # Stable descending sort: ties keep their order from the input file.
    data_new.sort(key=lambda row: row[2], reverse=True)
    df = pd.DataFrame(data_new)
    df.to_excel(OUTPUT_FILE, index=False)


if __name__ == "__main__":
    main()
|