import json
import re
import math
import numpy as np
from tqdm import tqdm


# Prompt template: asks for the quoted text (filled in via str.format) to be
# translated into English.
prompt = "请把“{}”这段文字翻译成英文"
# Prompt template: asks for 3-5 keywords for the quoted paper abstract,
# numbered with Arabic numerals, and shows a "1.xxx ... 5.xxx" example layout.
chinese_keyword_prompt = "请为“{}”这段论文摘要生成3-5个关键字,使用阿拉伯数字作为序号标注,例如“1.xxx \n2.xxx \n3.xxx \n4.xxx \n5.xxx \n”"
# Regex whose single group lazily captures whatever precedes the fixed
# "》为题目生成论文摘要,要求生成的字数在" phrase; the script uses group 1 as the title.
pantten_title = "(.*?)》为题目生成论文摘要,要求生成的字数在"






# Input file holding the prompt/abstract records; it is read in one go as
# UTF-8 and kept in `text` for the parsing step that follows.
path = "./data/paper_prompt_title_3/title_zhaiyao_prompt_data.txt"
with open(path, encoding="utf-8") as f:
    text = f.read()


# Every record in the input begins with the marker "请以《", so splitting on it
# produces one chunk per record (plus whatever precedes the first marker).
text_list = text.split("请以《")

# Translation prompts and keyword prompts, one entry per valid record.
data_list = []
chinese_keyword_data_list = []

# Compile the title pattern once instead of looking it up on every iteration.
title_regex = re.compile(pantten_title)

for text_dan in tqdm(text_list):
    # A well-formed chunk has exactly one "**************" separator between
    # the title prompt and the abstract; anything else is skipped silently.
    parts = text_dan.split("**************")
    if len(parts) != 2:
        continue
    title_prompt, zhaiyao = parts

    # Chunks whose prompt does not contain the expected title phrase are
    # printed for inspection and skipped.
    result_biaoti_list = title_regex.findall(title_prompt)
    if not result_biaoti_list:
        print(title_prompt)
        continue

    # The title is not used by either prompt below; it is still assigned so
    # the module-level name stays available as before.
    title = str(result_biaoti_list[0]).strip("\n")
    zhaiyao = str(zhaiyao).strip("\n")

    data_list.append(prompt.format(zhaiyao))
    chinese_keyword_data_list.append(chinese_keyword_prompt.format(zhaiyao))


import random

# Each prompt list is shuffled in place and then dumped to its own file,
# one JSON-encoded string per line (non-ASCII characters kept as-is).
for out_path, prompts in (
    ("./data/zhaiyao_to_/zhaiyao_fanyi_prompt.txt", data_list),
    ("./data/zhaiyao_to_/zhaiyao_chinese_keyword_prompt.txt", chinese_keyword_data_list),
):
    random.shuffle(prompts)
    with open(out_path, mode="w", encoding="utf-8") as f:
        for i in prompts:
            f.write(json.dumps(i, ensure_ascii=False) + "\n")



#     for lable in table_of_contents:
#         text_len = len(paper_text)
#         dan_nerlable = [text_len, text_len + len(lable[0]), lable[1]]
#         nerlable_list.append(dan_nerlable)
#         paper_text += lable[0]
#         paper_text += "@"
#
#     paper_dan = {"text": paper_text, "label": nerlable_list}
#
#     ner_lable.append(str(table_of_contents))
#     text_zong.append(paper_dan)
#
# with open("../data/train.txt", mode="w", encoding="utf-8") as f:
#     for i in text_zong:
#         f.write(json.dumps(i, ensure_ascii=False))
#         f.write("\n")
#
#
# with open("../data/train_lable.txt", mode="w") as f:
#     for i in ner_lable:
#         f.write(json.dumps(i, ensure_ascii=False))
#         f.write("\n")