You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
"""
|
|
|
|
@Time : 2022/12/20 17:56
|
|
|
|
@Author :
|
|
|
|
@FileName:
|
|
|
|
@Software:
|
|
|
|
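@Describe: Filter sentence pairs read from a CSV file: drop pairs with missing values or more than 120 characters, strip tabs and spaces, and write the remaining pairs to a new CSV file.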
|
|
|
|
"""
|
|
|
|
import pandas as pd
|
|
|
|
from tqdm import tqdm
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
|
|
# Input CSV; only the first two columns of each row are consumed further down.
path = "../data/论文_yy_小说_1.csv"
# Load every row as a plain Python list. ``to_numpy()`` is the accessor the
# pandas documentation recommends over the legacy ``.values`` attribute.
df_list = pd.read_csv(path).to_numpy().tolist()
|
|
|
|
|
|
|
|
|
|
|
|
def sentence_do(source, result, max_len=120):
    """Validate and clean one (source, result) sentence pair.

    Both values are converted with ``str()``. The pair is rejected when
    either converted value equals ``"nan"`` (which is what ``str()`` yields
    for a float NaN) or is longer than ``max_len`` characters. The length
    check runs before any whitespace is removed. Accepted pairs have every
    tab and space character removed.

    Args:
        source: Original sentence; any object accepted by ``str()``.
        result: Rewritten sentence; any object accepted by ``str()``.
        max_len: Maximum allowed length of each converted string, measured
            before tabs and spaces are stripped. Defaults to 120.

    Returns:
        A tuple ``(ok, source, result)``. When ``ok`` is ``False`` the two
        strings are the unmodified ``str()`` conversions; when ``ok`` is
        ``True`` they are the cleaned strings.
    """
    source = str(source)
    result = str(result)

    if source == "nan" or result == "nan":
        return False, source, result
    if len(source) > max_len or len(result) > max_len:
        return False, source, result

    # NOTE(review): the original chain ended with two extra
    # ``.replace("", "")`` calls, which are no-ops as written. If they were
    # meant to strip invisible characters that got lost in copy/paste, add
    # them back here using explicit escapes (e.g. "\u200b") -- TODO confirm.
    source = source.replace("\t", "").replace(" ", "")
    result = result.replace("\t", "").replace(" ", "")
    return True, source, result
|
|
|
|
|
|
|
|
|
|
|
|
# Keep only the pairs that sentence_do accepts, stored in their cleaned form.
df_list_new = []
for row in df_list:
    # Only the first two columns of each input row are used.
    is_valid, source, result = sentence_do(row[0], row[1])
    if is_valid:
        df_list_new.append([source, result])

df = pd.DataFrame(df_list_new, columns=["原文", "yy降重"])
# index=False keeps the DataFrame's row index out of the output file.
df.to_csv("../data/论文_yy_小说_3.csv", index=False)
|