# -*- coding: utf-8 -*-
"""
@Time : 2022/12/20 10:35
@Author :
@FileName:
@Software:
@Describe: Pair the <em> sentences of a source HTML file with the <em>
           sentences at the same position in a result HTML file, keep the
           pairs whose result sentence is flagged "similar", and export them
           to an Excel workbook.
"""
import os
import re

from bs4 import BeautifulSoup
import pandas as pd

# Control characters \x00-\x08, \x0b-\x0c and \x0e-\x1f: uncommon,
# non-printable characters (backspace, bell, ...) that are not legal in Excel
# cells. Compiled once here instead of on every data_clean() call.
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

# Values of the result <em>'s "class" attribute that mark a sentence to keep.
SIMILAR_CLASSES = (['similar', 'red'], ['similar'])


def data_clean(text):
    """Return *text* with Excel-illegal control characters removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without any character matched by
        ``ILLEGAL_CHARACTERS_RE``.
    """
    return ILLEGAL_CHARACTERS_RE.sub(r'', text)


# Parse both documents. BeautifulSoup consumes the file inside its
# constructor, so each handle can be closed right after parsing.
yuanshi = "../data/11篇yy/paperyyreduce20230221120936.html"
with open(yuanshi, encoding='utf-8') as source_file:
    soup_source = BeautifulSoup(source_file, "html.parser")

yyshuju = "../data/11篇yy/paperyyreduce_result20230221120936"
with open(yyshuju, encoding='utf-8') as result_file:
    soup_result = BeautifulSoup(result_file, "html.parser")

source_sentence_list = soup_source.select('p > em')
result_sentence_list = soup_result.select('p > em')

# Walk the source sentences and pair each one with the result sentence at the
# same position. Any sentence that cannot be paired is reported by printing
# its index and then skipped.
data = []
for sentence_index, source_em in enumerate(source_sentence_list):
    try:
        source_id = source_em["id"]
        print(source_id)
        # IndexError here means the result file has fewer sentences.
        result_em = result_sentence_list[sentence_index]
        result_id = result_em["id"]
        print(result_id)
        result_class = result_em["class"]
        print(result_class)
    except (KeyError, IndexError):
        # Missing "id"/"class" attribute or no counterpart in the result file.
        print(sentence_index)
        continue

    # Keep only aligned sentences (same id) that the result marks as similar.
    if source_id != result_id or result_class not in SIMILAR_CLASSES:
        continue

    source_text = source_em.string
    result_text = result_em.string
    # `.string` can be None (e.g. an <em> that contains nested tags); check
    # before calling .strip() so the skip is explicit rather than an
    # AttributeError caught by a blanket handler.
    if source_text is None or result_text is None:
        print(sentence_index)
        continue

    data.append([source_text.strip("\n"), result_text.strip("\n")])

print(data)

# Clean every cell, then write the pairs without the DataFrame index column.
df = pd.DataFrame(data, columns=["原文", "yy降重"])
for col in df.columns:
    df[col] = df[col].apply(data_clean)
df.to_excel("../data/11篇_yy.xlsx", index=False)