import os
import json
import random
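# Build a shuffled JSONL-style training file for ChatGLM paper-writing tasks:
# walk several prompt-dump directories, cut every dump file into
# "<prompt>**************<response>" blocks using a per-file delimiter, cap the
# number of samples per task, and write {"content": ..., "summary": ...} records.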

def is_contains_chinese(strs):
    """Return True if the string contains at least one CJK character."""
    for _char in strs:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False
# Unused pattern for second-level headings ("2、"/"二、"/"Ⅱ、" + Chinese text):
# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'

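# Per-task sample caps and prompt labels. "num_token" is the maximum number of
# samples taken from the corresponding file below; the "prompt" strings are not
# referenced by this script and are kept only for documentation.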
lable_data_amount = {
    "title_beijing_prompt_data.txt": {"num_token": 5000, "prompt": "生成论文来源的背景#"},
    "title_jianjie_prompt_data.txt": {"num_token": 5000, "prompt": "生成研究内容#"},
    "title_mulu_prompt_data.txt": {"num_token": 5000, "prompt": "生成目录#"},
    "title_yanjiubeijingyiyi_prompt_data.txt": {"num_token": 5000, "prompt": "生成课题的研究背景和意义#"},
    "title_zongjie_prompt_data.txt": {"num_token": 5000, "prompt": "生成论文简短总结#"},
    "title_zongshu_prompt_data.txt": {"num_token": 5000, "prompt": "生成课题的国内外研究状况综述#"},
    "jianjie_task_book_prompt_data.txt": {"num_token": 5000, "prompt": "生成6点本篇论文应完成的主要内容#"},
    "title_mulu_references_prompt_data.txt": {"num_token": 1, "prompt": "生成参考文献#"},
    "title_mulu_small_title_prompt_shuffle_data.txt": {"num_token": 18730, "prompt": "生成论文小标题内容#"},
    "title_mulu_zhaiyao_data.txt": {"num_token": 5000, "prompt": "生成论文摘要#"},
    "zhaiyao_chinese_keyword_prompt_data.txt": {"num_token": 5000, "prompt": "生成关键字#"},
    "zhaiyao_fanyi_prompt_data.txt": {"num_token": 5000, "prompt": "翻译摘要#"},
    "chinese_keyword_en_prompt_data.txt": {"num_token": 5000, "prompt": "翻译关键词#"},
    "title_hexin_beijing_prompt_data.txt": {"num_token": 4971, "prompt": "生成论文来源的背景#"},
    "title_hexin_jianjie_prompt_data.txt": {"num_token": 4903, "prompt": "生成研究内容#"},
    "title_hexin_mulu_prompt_data.txt": {"num_token": 4954, "prompt": "生成目录#"},
    "title_hexin_yanjiubeijingyiyi_prompt_data.txt": {"num_token": 4902, "prompt": "生成课题的研究背景和意义#"},
    "title_hexin_zongjie_prompt_data.txt": {"num_token": 4971, "prompt": "生成论文简短总结#"},
    "title_hexin_zongshu_prompt_data.txt": {"num_token": 4671, "prompt": "生成课题的国内外研究状况综述#"}
}

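# Delimiter used to cut each dump file into individual samples: either the
# fixed opening of the next prompt, or a literal run of "@" characters written
# between samples.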
re_file = {
    "title_beijing_prompt_data.txt": "\n以“",
    "title_jianjie_prompt_data.txt": "\n请帮我生成《",
    "title_mulu_prompt_data.txt": "\n为论文题目“",
    "title_yanjiubeijingyiyi_prompt_data.txt": "\n请分别写出以《",
    "title_zongjie_prompt_data.txt": "\n以“",
    "title_zongshu_prompt_data.txt": "\n请写出以《",
    "jianjie_task_book_prompt_data.txt": "\n\"请根据题目为《",
    "title_mulu_references_prompt_data.txt": "\n\"论文题目是“",
    "zhaiyao_chinese_keyword_prompt_data.txt": "\n\"请为“",
    "zhaiyao_fanyi_prompt_data.txt": "\n\"请把“",
    "chinese_keyword_en_prompt_data.txt": "\n\"请把“",
    "title_mulu_zhaiyao_data.txt": "@@@@@@@@@@@@@@@@@@",
    "title_mulu_small_title_prompt_shuffle_data.txt": "@@@@@@@@@@@@@@@@@@",
    "title_hexin_beijing_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_jianjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_mulu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_yanjiubeijingyiyi_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_zongjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_zongshu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@"
}

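# Tasks split on the "@" marker above: the delimiter carries no prompt text,
# so it is not prepended back onto each sample in the loop below.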
split_teshu = [
    "title_mulu_zhaiyao_data.txt",
    "title_mulu_small_title_prompt_shuffle_data.txt",
    "title_hexin_beijing_prompt_data.txt",
    "title_hexin_jianjie_prompt_data.txt",
    "title_hexin_mulu_prompt_data.txt",
    "title_hexin_yanjiubeijingyiyi_prompt_data.txt",
    "title_hexin_zongjie_prompt_data.txt",
    "title_hexin_zongshu_prompt_data.txt"
    ]

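# Collect every file under the prompt-dump directories.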
path_list = []
data_dirs = [
    "./data/paper_prompt_title_3",
    "./data/paper_prompt_title_3_1",
    "./data/paper_prompt_title_3_1_1",
    "./data/paper_prompt_title_hexin_3",
]
for data_dir in data_dirs:
    for root, dirs, files in os.walk(data_dir):
        for file_name in files:
            path_list.append(os.path.join(root, file_name))


text_list_new = []  # raw "<prompt>**************<response>" blocks kept for training

tongji = {}  # per-task sample counters, printed at the end


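# Split each dump file on its delimiter, keep only blocks that contain the
# "**************" prompt/response separator, and stop once the per-task cap
# ("num_token") is reached.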
for path in path_list:
    task_name = os.path.basename(path)  # path.split("\\") only works on Windows
    if task_name not in re_file:
        continue
    split_dan = re_file[task_name]

    train_data_amount = lable_data_amount[task_name]["num_token"]

    with open(path, encoding="utf-8") as f:
        text = f.read()
    text_list = text.split(split_dan)

    index = 1  # element 0 is the text before the first delimiter, so skip it
    while index < train_data_amount and index < len(text_list):
        data_dan = text_list[index]
        if "**************" in data_dan:
            # Optional filter (disabled): for title_jianjie_prompt_data.txt, drop
            # samples whose summary contains no Chinese (see is_contains_chinese).
            if task_name not in split_teshu:
                # Re-attach the delimiter (minus its leading newline) so the
                # prompt text stays intact for these tasks.
                data_dan = split_dan[1:] + data_dan
            text_list_new.append(data_dan)
            index += 1
            tongji[task_name] = tongji.get(task_name, 0) + 1
        else:
            # Block without the prompt/response separator: print it and skip ahead.
            print(data_dan)
            index += 4


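# Each kept block becomes one training record:
#   {"content": "<prompt text>", "summary": "<target text>"}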
train_list = []
for text in text_list_new:
    # Split on the first separator only, in case the response itself contains
    # another run of asterisks.
    content, summary = text.split("**************", 1)
    train_list.append(
        {"content": content.strip('"\n'), "summary": summary}
    )

# Shuffle so the tasks are interleaved in the output file.
random.shuffle(train_list)


# Report how many samples each task contributed, then write one JSON object per line.
for task_name in tongji:
    print(task_name, tongji[task_name])

with open("./data/chatglm_paper_data_2.txt", mode="w", encoding="utf-8") as f:
    for sample in train_list:
        f.write(json.dumps(sample, ensure_ascii=False))
        f.write("\n")
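
# Optional sanity check: re-read the output file and confirm every line parses
# as JSON with the expected keys.
with open("./data/chatglm_paper_data_2.txt", encoding="utf-8") as f:
    n_ok = 0
    for line in f:
        record = json.loads(line)
        assert "content" in record and "summary" in record
        n_ok += 1
print("wrote", n_ok, "training samples")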