You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
75 lines
2.3 KiB
75 lines
2.3 KiB
![]()
2 years ago
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
"""
|
||
|
@Time : 2022/12/20 10:35
|
||
|
@Author :
|
||
|
@FileName:
|
||
|
@Software:
|
||
|
@Describe:
|
||
|
"""
|
||
|
import os
|
||
|
from bs4 import BeautifulSoup
|
||
|
import pandas as pd
|
||
|
import re
|
||
|
# Traverse the folder
|
||
|
|
||
|
|
||
|
# Filled in by walkFile(); kept at module level so the rest of the script
# can read the collected prefixes.
path_list = []


def walkFile(file):
    """Collect the name prefix of every file found under ``file``.

    The directory tree rooted at ``file`` is walked recursively with
    ``os.walk``. For each file, the part of its name that precedes the first
    underscore (or the whole name when it contains no underscore) is appended
    to the module-level ``path_list``. Duplicates are kept.

    Args:
        file: Path of the directory to walk.

    Returns:
        None. The result is accumulated in ``path_list``.
    """
    # os.walk yields (current directory, sub-directory names, file names);
    # only the file names are needed here.
    for _current_dir, _subdirs, filenames in os.walk(file):
        path_list.extend(
            str(filename).partition("_")[0] for filename in filenames
        )
|
||
|
|
||
|
|
||
|
# Gather the file-name prefixes present in the data folder.
walkFile("../data/yy_reduce_data")

# A prefix is recorded once per file that carries it, so de-duplicate.
# Sorting (instead of relying on set order) keeps the processing order, and
# therefore the row order of `data`, the same from one run to the next.
path_list = sorted(set(path_list))
print(path_list)

# Rows of [source sentence, rewritten sentence].
data = []
for i in path_list:
    # Each prefix is expected to have a "<prefix>_source" and a
    # "<prefix>_result" HTML file; the handles are closed as soon as the
    # documents have been parsed.
    with open("../data/yy_reduce_data/{}_source".format(i), encoding='utf-8') as source_file:
        soup_source = BeautifulSoup(source_file, "html.parser")
    with open("../data/yy_reduce_data/{}_result".format(i), encoding='utf-8') as result_file:
        soup_result = BeautifulSoup(result_file, "html.parser")

    # Sentences are the <em> elements that are direct children of a <p>.
    source_sentence_list = soup_source.select('p > em')
    result_sentence_list = soup_result.select('p > em')

    for sentence_index, source_sentence in enumerate(source_sentence_list):
        try:
            result_sentence = result_sentence_list[sentence_index]
            # Keep a pair only when both sides carry the same id and the
            # result sentence's class list is exactly ['similar'].
            is_wanted_pair = (
                source_sentence["id"] == result_sentence["id"]
                and result_sentence["class"] == ['similar']
            )
        except (IndexError, KeyError):
            # IndexError: the result file has fewer sentences than the source.
            # KeyError: an <em> lacks the "id" or "class" attribute.
            # Report the position and move on to the next sentence.
            print(i, sentence_index)
            continue

        if not is_wanted_pair:
            continue

        # .string is None when the element does not hold exactly one text
        # node; such sentences are skipped.
        source_text = source_sentence.string
        result_text = result_sentence.string
        if source_text is not None and result_text is not None:
            data.append([source_text, result_text])
|
||
|
|
||
|
# print(data)
|
||
|
|
||
|
|
||
|
# Control characters that cannot be written into an Excel cell: everything
# below 0x20 except tab (\011), line feed (\012) and carriage return (\015).
# Compiled once at import time instead of on every call.
_ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')


def data_clean(text):
    """Remove characters that are illegal in Excel cells from ``text``.

    These are rarely seen, non-printable control characters such as
    backspace or bell.

    Args:
        text: The cell value to clean.

    Returns:
        ``text`` with the illegal control characters removed. Values that are
        not strings (for example ``None`` or NaN from an empty cell) are
        returned unchanged instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        return text
    return _ILLEGAL_CHARACTERS_RE.sub('', text)
|
||
|
|
||
|
|
||
|
# Put the collected sentence pairs into a two-column table. The headers
# translate to "original text" and "yy rewrite".
df = pd.DataFrame(data, columns=["原文", "yy降重"])

# Strip Excel-illegal control characters from every cell, column by column.
for column_name in df.columns:
    df[column_name] = df[column_name].apply(data_clean)

# Write the table to a workbook without the index column.
df.to_excel("../data/论文_yy_小说.xlsx", index=None)
|