# -*- coding: utf-8 -*-
"""
@Time    : 2022/12/20 10:35
@Author  :
@FileName:
@Software:
@Describe: Pair sentences from ``<prefix>_source`` / ``<prefix>_result`` HTML
           files and export the pairs flagged as "similar" to an Excel file.
"""
import os
import re

import pandas as pd

# Folder holding the ``<prefix>_source`` / ``<prefix>_result`` HTML files.
DATA_DIR = "../data/yy_reduce_data"
# Destination workbook.
OUTPUT_PATH = "../data/论文_yy_小说.xlsx"
# Column headers of the exported sheet (source text, rewritten text).
COLUMNS = ["原文", "yy降重"]

# Characters that are illegal in Excel: uncommon non-printable control
# characters such as backspace and bell.  Tab, newline and carriage return
# are deliberately left out of the ranges.  Compiled once at import time
# instead of on every ``data_clean`` call.
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

# Every prefix seen by ``walkFile`` so far (may contain duplicates).
path_list = []


def walkFile(file):
    """Walk the folder *file* and record the prefix of every file name.

    The prefix is the part of the file name before the first underscore
    (``"12_source"`` -> ``"12"``).  Sub-folders are walked as well.

    Args:
        file: Path of the folder to walk.

    Returns:
        The prefixes found during this call, one per file, duplicates
        included.  The same prefixes are also appended to the module-level
        ``path_list``, which is what this function historically did.
    """
    found = []
    # os.walk yields (current folder path, sub-folder names, file names).
    for _root, _dirs, files in os.walk(file):
        found.extend(str(f).split("_")[0] for f in files)
    path_list.extend(found)
    return found


def load_sentences(path):
    """Parse the HTML file at *path* and return its ``p > em`` elements.

    Args:
        path: Path of a UTF-8 encoded HTML file.

    Returns:
        The list of elements matched by the CSS selector ``p > em``.
    """
    # Imported here rather than at module top so that the pure helpers in
    # this file stay importable when BeautifulSoup is not installed.
    from bs4 import BeautifulSoup

    # The context manager closes the handle once the document is parsed.
    with open(path, encoding='utf-8') as handle:
        soup = BeautifulSoup(handle, "html.parser")
    return soup.select('p > em')


def extract_similar_pairs(source_tags, result_tags):
    """Pair up source/result sentences that are flagged as similar.

    Elements are compared position by position.  A pair is kept when both
    elements carry the same ``id``, the result element's ``class`` is exactly
    ``['similar']`` and both elements have a non-``None`` ``.string``.

    Args:
        source_tags: Sequence of elements from the source document; each must
            support ``tag["id"]`` and ``tag.string``.
        result_tags: Sequence of elements from the result document; each must
            support ``tag["id"]``, ``tag["class"]`` and ``tag.string``.

    Returns:
        A ``(pairs, skipped)`` tuple.  ``pairs`` is a list of
        ``[source_text, result_text]`` lists.  ``skipped`` lists the indices
        that could not be compared: an ``id``/``class`` attribute was missing,
        or the result document has no element at that position.
    """
    pairs = []
    skipped = []
    for index, (src, res) in enumerate(zip(source_tags, result_tags)):
        try:
            # ``and`` short-circuits, so ``class`` is only required when the
            # ids already match.
            matched = src["id"] == res["id"] and res["class"] == ['similar']
        except KeyError:
            skipped.append(index)
            continue
        if not matched:
            continue
        source_text = src.string
        result_text = res.string
        if source_text is not None and result_text is not None:
            # Plain ``str`` copies, so the rows do not keep referencing the
            # parsed element objects.
            pairs.append([str(source_text), str(result_text)])
    # Source sentences with no counterpart in a shorter result document.
    skipped.extend(range(len(result_tags), len(source_tags)))
    return pairs, skipped


def data_clean(text):
    """Remove the characters matched by ``ILLEGAL_CHARACTERS_RE`` from *text*.

    Args:
        text: The string to clean.

    Returns:
        *text* without the illegal control characters.
    """
    return ILLEGAL_CHARACTERS_RE.sub(r'', text)


def main():
    """Collect all similar sentence pairs under ``DATA_DIR`` and export them."""
    walkFile(DATA_DIR)
    # Sorted so that the processing order, and therefore the row order of the
    # exported sheet, is the same on every run.
    prefixes = sorted(set(path_list))
    print(prefixes)

    data = []
    for prefix in prefixes:
        source_path = os.path.join(DATA_DIR, "{}_source".format(prefix))
        result_path = os.path.join(DATA_DIR, "{}_result".format(prefix))
        # A stray file (or one inside a sub-folder) yields a prefix without a
        # matching pair in DATA_DIR; skip it instead of aborting the export.
        if not (os.path.isfile(source_path) and os.path.isfile(result_path)):
            print("skipping {}: source/result file not found".format(prefix))
            continue

        pairs, skipped = extract_similar_pairs(
            load_sentences(source_path),
            load_sentences(result_path),
        )
        data.extend(pairs)
        for sentence_index in skipped:
            print(prefix, sentence_index)

    cleaned = [[data_clean(cell) for cell in row] for row in data]
    df = pd.DataFrame(cleaned, columns=COLUMNS)
    df.to_excel(OUTPUT_PATH, index=False)


if __name__ == "__main__":
    main()