You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
43 lines
1.1 KiB
43 lines
1.1 KiB
![]()
2 years ago
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
"""
|
||
|
@Time : 2022/12/20 17:56
|
||
|
@Author :
|
||
|
@FileName:
|
||
|
@Software:
|
||
|
@Describe:
|
||
|
"""
|
||
|
import pandas as pd
|
||
|
from tqdm import tqdm
|
||
|
import json
|
||
|
|
||
|
|
||
|
# Input workbook; every row is read into a plain Python list of cell values.
path = "../data/论文_yy_小说_1.xlsx"
_source_frame = pd.read_excel(path)
df_list = _source_frame.values.tolist()
|
||
|
|
||
|
|
||
|
def sentence_do(source, result, max_len=120):
    """Validate a (source, result) text pair and strip tabs and spaces.

    Both values are converted with ``str``. The pair is rejected when either
    value is missing (``None``, or any value whose string form is ``"nan"``,
    such as a float NaN) or when either string is longer than ``max_len``
    characters. The length is measured before tabs and spaces are removed.

    Args:
        source: First text of the pair; any object, converted with ``str``.
        result: Second text of the pair; any object, converted with ``str``.
        max_len: Maximum allowed length of each string. Defaults to 120.

    Returns:
        A tuple ``(ok, source, result)``. ``ok`` is ``True`` when the pair is
        accepted, in which case both strings have tabs and spaces removed.
        When ``ok`` is ``False`` the strings are returned as converted by
        ``str`` and otherwise unchanged.
    """
    # Record missing values before str() turns None into the text "None".
    missing = source is None or result is None
    source = str(source)
    result = str(result)

    if missing or source == "nan" or result == "nan":
        return False, source, result
    if len(source) > max_len or len(result) > max_len:
        return False, source, result

    # One pass removes both tab and space characters.
    # NOTE(review): replace("", "") is a no-op, so it is not used here. If
    # invisible characters (e.g. zero-width spaces) must also be stripped,
    # add them to this table as explicit escape sequences.
    strip_table = str.maketrans("", "", "\t ")
    return True, source.translate(strip_table), result.translate(strip_table)
|
||
|
|
||
|
|
||
|
# Keep only the rows accepted by sentence_do, storing their cleaned text.
df_list_new = []
for row in df_list:
    # The first two cells of each row form the pair to validate.
    keep, source, result = sentence_do(row[0], row[1])
    if keep:
        df_list_new.append([source, result])

# Write the surviving pairs to a new workbook, without the index column.
df = pd.DataFrame(df_list_new, columns=["原文", "yy降重"])
df.to_excel("../data/论文_yy_小说_3.xlsx", index=False)
|