普通版降重
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

87 lines
2.7 KiB

# -*- coding: utf-8 -*-
"""
@Time : 2023/2/1 19:18
@Author :
@FileName:
@Software:
@Describe:
"""
import os
import os
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
# Accumulates the path of every file discovered by walkFile().
data_path_list = []


def walkFile(file):
    """Recursively collect the paths of all files under ``file``.

    Each file name reported by ``os.walk`` is joined with the directory that
    contains it and appended to the module-level ``data_path_list``, in the
    order ``os.walk`` yields them. Sub-directory names are not recorded.
    """
    for current_dir, _subdirs, file_names in os.walk(file):
        # current_dir: the directory currently being visited
        # _subdirs:    names of its sub-directories (unused)
        # file_names:  names of the files directly inside it
        data_path_list.extend(
            os.path.join(current_dir, file_name) for file_name in file_names
        )
def main():
    """Scan the raw data directory so that walkFile() records every file in it."""
    dataset_root = "../data/yy_reduce_data_20230210-20230718"
    walkFile(dataset_root)


# Executed at module level: the scan runs as soon as this file is executed.
main()
def _load_soup(path):
    """Parse the HTML file at ``path``; return ``None`` if it cannot be read.

    The file is opened in a ``with`` block so the handle is always closed,
    including when reading or decoding fails part-way through.
    """
    try:
        with open(path, encoding='utf-8') as handle:
            return BeautifulSoup(handle, "html.parser")
    except (OSError, UnicodeDecodeError):
        # Missing/unreadable file or invalid UTF-8: the caller skips this
        # directory, as the script has always done for unusable inputs.
        return None


# Collected [source sentence, rewritten sentence] pairs.
data = []

# The loop below reads a "source" and a "result" file from each directory, so
# reduce every discovered file path to the directory that contains it.
rootpath_list = [os.path.dirname(str(path)) for path in data_path_list]
# Number of discovered files (one entry per file, before de-duplication).
print(len(rootpath_list))
# De-duplicate; sorting makes the processing order (and therefore the order of
# the collected rows) reproducible between runs.
rootpath_list = sorted(set(rootpath_list))

for root_dir in tqdm(rootpath_list):
    soup_source = _load_soup(os.path.join(root_dir, "source"))
    if soup_source is None:
        continue
    soup_result = _load_soup(os.path.join(root_dir, "result"))
    if soup_result is None:
        continue

    source_sentence_list = soup_source.select('p > em')
    result_sentence_list = soup_result.select('p > em')

    # zip() stops at the shorter list, so positions that exist in only one of
    # the two documents are ignored.
    for source_em, result_em in zip(source_sentence_list, result_sentence_list):
        try:
            # Keep a pair only when both <em> tags carry the same id and the
            # result tag's class attribute is exactly ['similar'].
            is_wanted_pair = (
                source_em["id"] == result_em["id"]
                and result_em["class"] == ['similar']
            )
        except KeyError:
            # A tag without an "id" or "class" attribute cannot be matched.
            continue
        if not is_wanted_pair:
            continue

        # .string is None when the tag does not wrap exactly one text node.
        source_text = source_em.string
        result_text = result_em.string
        if source_text is not None and result_text is not None:
            data.append([source_text, result_text])
# ASCII control characters 0x00-0x08, 0x0B-0x0C and 0x0E-0x1F (e.g. bell,
# backspace).  Tab, line feed and carriage return are deliberately not listed.
# Compiled once at module level instead of on every call.
_ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')


def data_clean(text):
    """Return ``text`` with non-printable control characters removed.

    Strips the uncommon, non-displayable characters (backspace, bell, ...)
    that are illegal in Excel cells, while keeping tab, newline and carriage
    return.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any character matched by
        ``_ILLEGAL_CHARACTERS_RE``.
    """
    return _ILLEGAL_CHARACTERS_RE.sub(r'', text)
# Build a two-column table from the collected pairs, strip illegal control
# characters from every cell, and write the result to CSV without the index.
output_csv = "../data/论文_yy_小说_1.csv"
df = pd.DataFrame(data, columns=["原文", "yy降重"])
for column_name in df.columns:
    cleaned_column = df[column_name].apply(data_clean)
    df[column_name] = cleaned_column
df.to_csv(output_csv, index=None)