普通版降重
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

64 lines
2.1 KiB

# -*- coding: utf-8 -*-
"""
@Time : 2022/12/20 10:35
@Author :
@FileName:
@Software:
@Describe:
"""
import os
from bs4 import BeautifulSoup
import pandas as pd
import re
# Parse the source HTML and the result file, then collect from each the <em>
# elements that are direct children of a <p>.
yuanshi = "../data/11篇yy/paperyyreduce20230221120936.html"
# BeautifulSoup consumes the file object while constructing the tree, so each
# handle can be closed as soon as the constructor returns.
with open(yuanshi, encoding='utf-8') as source_file:
    soup_source = BeautifulSoup(source_file, "html.parser")
yyshuju = "../data/11篇yy/paperyyreduce_result20230221120936"
with open(yyshuju, encoding='utf-8') as result_file:
    soup_result = BeautifulSoup(result_file, "html.parser")
source_sentence_list = soup_source.select('p > em')
result_sentence_list = soup_result.select('p > em')
# Build [source_text, result_text] pairs from the two <em> lists, matched by
# position.  A pair is kept only when both elements carry the same id and the
# result element's class list is exactly one of ACCEPTED_RESULT_CLASSES.
data = []
ACCEPTED_RESULT_CLASSES = (['similar', 'red'], ['similar'])
for sentence_index, (source_em, result_em) in enumerate(
        zip(source_sentence_list, result_sentence_list)):
    try:
        # Tag["attr"] raises KeyError when the attribute is absent.
        source_id = source_em["id"]
        print(source_id)
        result_id = result_em["id"]
        print(result_id)
        result_class = result_em["class"]
        print(result_class)
    except KeyError:
        # Report the position of the malformed element and move on.
        print(sentence_index)
        continue
    if source_id != result_id or result_class not in ACCEPTED_RESULT_CLASSES:
        continue
    source_text = source_em.string
    result_text = result_em.string
    # .string is None when the element does not wrap exactly one text node;
    # check before calling .strip() so the skip is explicit instead of relying
    # on a swallowed AttributeError.
    if source_text is None or result_text is None:
        print(sentence_index)
        continue
    data.append([source_text.strip("\n"), result_text.strip("\n")])
# Source sentences beyond the end of the result list have no counterpart;
# report their positions like any other skipped sentence.
for sentence_index in range(len(result_sentence_list),
                            len(source_sentence_list)):
    print(sentence_index)
# Characters treated as illegal in Excel cells: the non-printable control codes
# 0-31 (backspace, bell, ...) except tab (\011), line feed (\012) and carriage
# return (\015).  Compiled once at import instead of on every call.
_ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')


def data_clean(text):
    """Remove control characters that are illegal in Excel cells.

    Args:
        text: Cell value to clean.

    Returns:
        ``text`` with the illegal characters removed when it is a ``str``;
        any other value (for example ``None`` or NaN) is returned unchanged
        so the function is safe to apply to every cell of a DataFrame.
    """
    if not isinstance(text, str):
        return text
    return _ILLEGAL_CHARACTERS_RE.sub('', text)
print(data)
# One row per collected sentence pair, written out as a two-column spreadsheet.
df = pd.DataFrame(data, columns=["原文", "yy降重"])
# Run every cell through data_clean before the sheet is written.
for col in df.columns:
    df[col] = df[col].apply(data_clean)
# index=False keeps the DataFrame's row index out of the spreadsheet.
df.to_excel("../data/11篇_yy.xlsx", index=False)