# -*- coding: utf-8 -*-

"""
@Time    :  2023/2/1 19:18
@Author  : 
@FileName: 
@Software: 
@Describe:
"""

import os
import os
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
# Walk the data directory tree


# Accumulates the joined path of every file discovered by walkFile().
data_path_list = []


def walkFile(file):
    """Record the path of every file found beneath ``file``.

    The directory tree rooted at ``file`` is traversed with ``os.walk`` and
    each file's path (directory joined with file name) is appended to the
    module-level ``data_path_list``. Sub-directory names themselves are not
    recorded. Nothing is returned.
    """
    for current_dir, _subdirs, filenames in os.walk(file):
        data_path_list.extend(
            os.path.join(current_dir, filename) for filename in filenames
        )
def main():
    """Collect every file path under the raw data directory via walkFile()."""
    dataset_root = "../data/yy_reduce_data_20221219-20230131"
    walkFile(dataset_root)


# Executed unconditionally: the script statements below iterate over the
# paths gathered by this call.
main()


def _read_soup(path):
    """Parse the UTF-8 HTML file at ``path`` and return its BeautifulSoup tree."""
    # The context manager closes the handle even when decoding/parsing fails.
    with open(path, encoding='utf-8') as handle:
        return BeautifulSoup(handle, "html.parser")


def _similar_pairs(source_soup, result_soup):
    """Yield ``[source_text, result_text]`` for aligned 'similar' sentences.

    Sentences are the ``<em>`` elements selected by ``'p > em'`` in each
    document, paired by position. A pair is kept only when:
      * both elements carry an ``id`` attribute and the ids are equal,
      * the result element's ``class`` is exactly ``['similar']``,
      * both elements have a single text child (``.string`` is not None).
    """
    source_sentences = source_soup.select('p > em')
    result_sentences = result_soup.select('p > em')
    # zip() stops at the shorter list, so a source sentence without a
    # positional counterpart in the result document is simply ignored.
    for source_em, result_em in zip(source_sentences, result_sentences):
        sentence_id = source_em.get("id")
        # A missing id on either side never counts as a match.
        if sentence_id is None or sentence_id != result_em.get("id"):
            continue
        if result_em.get("class") != ['similar']:
            continue
        source_text = source_em.string
        result_text = result_em.string
        if source_text is None or result_text is None:
            continue
        # Convert to plain str so the stored rows do not keep a reference to
        # the parse tree of every processed document.
        yield [str(source_text), str(result_text)]


# Rows of [source sentence, result sentence] collected from all folders.
data = []

# Parent folder of every discovered file. os.path.dirname understands the
# platform's path separator, unlike splitting on a hard-coded backslash.
rootpath_list = [os.path.dirname(str(path)) for path in data_path_list]

print(len(rootpath_list))
# De-duplicate; sorting keeps the output row order reproducible between runs.
rootpath_list = sorted(set(rootpath_list))
for folder in tqdm(rootpath_list):
    try:
        soup_source = _read_soup(os.path.join(folder, "source"))
        soup_result = _read_soup(os.path.join(folder, "result"))
    except Exception:
        # Best effort: a folder lacking either file, or holding one that
        # cannot be decoded/parsed, is skipped. Unlike a bare ``except`` this
        # still lets KeyboardInterrupt/SystemExit stop the run.
        continue

    data.extend(_similar_pairs(soup_source, soup_result))


# Control characters treated as illegal in Excel cells (uncommon, non-printable
# characters such as backspace and bell). Tab (\011), line feed (\012) and
# carriage return (\015) are outside the ranges and therefore preserved.
# Compiled once at import time instead of on every call.
_ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')


def data_clean(text):
    """Return ``text`` with Excel-illegal control characters removed.

    Args:
        text: The cell value to clean.

    Returns:
        The cleaned string when ``text`` is a ``str``. Any other value
        (e.g. ``None`` or a number) is returned unchanged, since it holds no
        characters to strip and would otherwise raise ``TypeError``.
    """
    if not isinstance(text, str):
        return text
    return _ILLEGAL_CHARACTERS_RE.sub(r'', text)


# Two-column table built from the collected rows: source sentence first,
# result sentence second.
df = pd.DataFrame(data, columns=["原文", "yy降重"])

# Pass every cell of every column through data_clean before exporting.
for column_name in df.columns:
    cleaned_column = df[column_name].apply(data_clean)
    df[column_name] = cleaned_column

df.to_excel("../data/论文_yy_小说_1.xlsx", index=None)