import json
import math
import random
import re

import numpy as np
from tqdm import tqdm

# Regex for first-level headings such as "一、引言" (Chinese numeral + 、 + heading text).
pantten_biaoti = r'[一二三四五六七八九][、]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
# Regex for third-level headings such as "1.1.1 ..." (dots escaped so they match literally).
pantten_biaoti_1 = r'[1-9]\.[1-9]\.[1-9](.*)'

# Prompt templates are kept in Chinese because the generated training data targets a Chinese model.
# first_title_prompt: expand a top-level chapter of the paper to a target word count.
first_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的大标题“{}”的内容补充完整,补充内容字数在{}字左右"
# small_title_prompt: expand a sub-section of the paper to a target word count.
small_title_prompt = "论文题目是“{}”,目录是“{}”,请把其中的小标题“{}”的内容补充完整,补充内容字数在{}字左右"
# mulu_prompt: generate a table of contents (first- and second-level headings only) from a title.
mulu_prompt = "请帮我根据题目为“{}”生成一个论文目录其中只含有一级标题和二级标题"

# Tail sections that are stripped from each table of contents.
thanks = "致谢"          # acknowledgements
references = "参考文献"  # references
excursus = "附录"        # appendix

u = 3.5                # mean μ of the word-count distribution
sig = math.sqrt(6.0)   # standard deviation σ
zong_gradient = 6
paper_word_count = 12000

path = "../data/title.txt"
with open(path, encoding="utf-8") as f:
    text = f.read()


def normal_distribution(x):
    """Normal density N(u, sig**2), used to spread word counts over sections."""
    y = np.exp(-(x - u) ** 2 / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig)
    return y


# Records are separated by a line of '+'; each record is "title ****** table of contents".
text_list = text.split("\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
ner_lable = []
text_zong = []
train_list = []

for text_dan in tqdm(text_list):
    tiaoguo = False  # set to True when the outline contains third-level headings and must be skipped
    try:
        title, mulu = text_dan.split("**********************************************")
    except ValueError:
        # the record does not split into exactly a title and a table of contents
        continue
    title = str(title).strip("\n")
    mulu = str(mulu).strip("\n")
    paper_text = "题目:{}@目录:".format(title)
    nerlable_list = []

    # Label every non-empty outline line as a first-level ("一级标题") or second-level ("二级标题") heading.
    mulu_list = str(mulu).split("\n")
    mulu_list = [i.strip() for i in mulu_list if i != ""]
    mulu_list_bool = []
    for i in mulu_list:
        result_biaoti_list = re.findall(pantten_biaoti, i)
        if result_biaoti_list:
            mulu_list_bool.append((i, "一级标题"))
        else:
            # A third-level heading ("1.1.1 ...") means the outline is too deep; drop the whole record.
            result_biaoti_sanji_list = re.findall(pantten_biaoti_1, i)
            if result_biaoti_sanji_list:
                tiaoguo = True
                break
            else:
                mulu_list_bool.append((i, "二级标题"))
    if tiaoguo:
        continue

    # Sanity checks on the outline structure: it must start with a single first-level heading,
    # be followed by second-level headings, and not end on a bare first-level heading.
    if len(mulu_list_bool) < 3:
        continue
    mulu_list_bool_part = mulu_list_bool[:3]
    if mulu_list_bool_part[0][1] != "一级标题":
        continue
    if mulu_list_bool_part[0][1] == mulu_list_bool_part[1][1] == "一级标题":
        continue
    if mulu_list_bool_part[-1][1] == "一级标题":
        continue

    # Drop acknowledgements, references and appendix entries from the tail of the outline.
    thanks_references_bool_table = mulu_list_bool[-5:]
    for i in thanks_references_bool_table:
        try:
            if references in i[0]:
                mulu_list_bool.remove(i)
            if thanks in i[0]:
                mulu_list_bool.remove(i)
            if excursus in i[0]:
                mulu_list_bool.remove(i)
        except ValueError:
            print(thanks_references_bool_table)
            continue

    # Group second-level headings under their first-level heading.
    table_of_contents = []
    for i in mulu_list_bool:
        if i[1] == "一级标题":
            paper_dan = {
                "title": "@@" + i[0],
                "small_title": [i[0]],
                "word_count": 0
            }
            table_of_contents.append(paper_dan)
        else:
            table_of_contents[-1]["small_title"].append(i[0])

    # Rebuild the outline text: headings within a chapter joined by "\n", chapters joined by "\n\n".
    table_of_contents_new = []
    for i in table_of_contents:
        a = "\n".join(i["small_title"])
        table_of_contents_new.append(a)
    b = "\n\n".join(table_of_contents_new)

    title_p = mulu_prompt.format(title)
    train_list.append({"content": str(title_p), "summary": str(b)})

print(train_list)

random.shuffle(train_list)
train_list_shuffle = train_list[:500]

# Full data set, one JSON object per line.
with open("../data/mulu_prompt.txt", mode="w", encoding="utf-8") as f:
    for i in train_list:
        f.write(json.dumps(i, ensure_ascii=False))
        f.write("\n")

# Shuffled 500-example subset, one JSON object per line.
with open("../data/mulu_prompt_shuffle.json", mode="w", encoding="utf-8") as f:
    for i in train_list_shuffle:
        f.write(json.dumps(i, ensure_ascii=False))
        f.write("\n")