# -*- coding: utf-8 -*-
"""
@Time    : 2023/2/1 19:18
@Author  :
@FileName:
@Software:
@Describe: Walk a data directory, read the ``source`` / ``result`` HTML file
           pair found in each folder, pair up the ``p > em`` sentences whose
           ids match and whose result element has class ``similar``, and
           export the pairs to an Excel workbook.
"""
import os
import re

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# Directory tree scanned for ``source`` / ``result`` file pairs.
INPUT_DIR = "../data/yy_reduce_data_20221219-20230131"
# Workbook the extracted sentence pairs are written to.
OUTPUT_XLSX = "../data/论文_yy_小说_1.xlsx"
# Column headers: first the text taken from ``source``, then from ``result``.
OUTPUT_COLUMNS = ["原文", "yy降重"]

# Characters that are illegal in Excel cells: uncommon, non-printable control
# characters such as backspace and bell. Tab, newline and carriage return are
# deliberately not part of the ranges. Compiled once instead of on every call.
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

# Paths of every file discovered by walkFile(); filled in place.
data_path_list = []


def walkFile(file):
    """Append the path of every file below directory *file* to data_path_list.

    The tree is traversed recursively with ``os.walk``; sub-directory names
    themselves are not recorded. A missing directory simply yields nothing.
    """
    for root, _dirs, files in os.walk(file):
        # root is the folder currently visited, files the file names inside it.
        data_path_list.extend(os.path.join(root, name) for name in files)


def _read_soup(path):
    """Parse the UTF-8 HTML file at *path* and close the file afterwards."""
    with open(path, encoding='utf-8') as handle:
        return BeautifulSoup(handle, "html.parser")


def _extract_pairs(directory):
    """Return ``[source_text, result_text]`` pairs found in *directory*.

    The directory must hold a ``source`` and a ``result`` file. If either is
    missing, unreadable, or not valid UTF-8, an empty list is returned
    (best-effort behaviour: such folders are skipped).
    """
    try:
        soup_source = _read_soup(os.path.join(directory, "source"))
        soup_result = _read_soup(os.path.join(directory, "result"))
    except (OSError, UnicodeDecodeError):
        return []

    pairs = []
    # zip() stops at the shorter list, so unmatched trailing sentences are
    # ignored instead of relying on a swallowed IndexError.
    for src, res in zip(soup_source.select('p > em'),
                        soup_result.select('p > em')):
        src_id = src.get("id")
        # Elements without an id are skipped, as are mismatching ids.
        if src_id is None or src_id != res.get("id"):
            continue
        if res.get("class") != ['similar']:
            continue
        src_text, res_text = src.string, res.string
        # .string is None when the element does not wrap exactly one string.
        if src_text is None or res_text is None:
            continue
        # Plain str copies avoid keeping the whole parse tree alive.
        pairs.append([str(src_text), str(res_text)])
    return pairs


def data_clean(text):
    """Remove characters that are illegal in Excel cells from *text*.

    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return ILLEGAL_CHARACTERS_RE.sub(r'', text)


def main():
    """Run the whole pipeline: scan, extract sentence pairs, clean, export."""
    walkFile(INPUT_DIR)
    # Number of files found (one entry per file, before de-duplication).
    print(len(data_path_list))

    # Unique parent folders; os.path.dirname works with any path separator,
    # and sorting makes the row order of the output reproducible.
    directories = sorted({os.path.dirname(path) for path in data_path_list})

    data = []
    for directory in tqdm(directories):
        data.extend(_extract_pairs(directory))

    df = pd.DataFrame(data, columns=OUTPUT_COLUMNS)
    for col in df.columns:
        df[col] = df[col].apply(data_clean)
    df.to_excel(OUTPUT_XLSX, index=False)


if __name__ == "__main__":
    main()