"""Filter ChatGPT-generated thesis outlines into single-line training records.

The input file holds entries separated by a run of 20 ``@`` characters.  Each
entry is ``<prompt>**************<outline>``.  An entry is kept only when its
outline has a clean two-level structure (see ``build_training_line``); kept
entries are written one per line with newlines escaped as the two characters
``\\n``.
"""
import os
import re

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; keep the script usable without it.
    def tqdm(iterable, *args, **kwargs):
        """Return *iterable* unchanged when tqdm is not installed."""
        return iterable

# Data generated by ChatGPT.
file = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt"
output_file = "./data/训练数据集合/generate_mulu.txt"

record_separator = "@" * 20
field_separator = "**************"

pantten_title = "《(.*?)》生成目录,要求只有一级标题和二级标题,"
# Raw string so that ``\s`` reaches the regex engine without relying on
# Python's deprecated handling of unknown string escapes.
pantten_biaoti = r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
thanks = "致谢"
references = "参考文献"
excursus = "附录"

_TITLE_RE = re.compile(pantten_title)
_HEADING_RE = re.compile(pantten_biaoti)
_TAIL_KEYWORDS = (references, thanks, excursus)
_FIRST_LEVEL = "一级标题"
_SECOND_LEVEL = "二级标题"

# Only this many trailing outline lines are inspected for
# acknowledgements / references / appendix entries.
_TAIL_WINDOW = 5
_MIN_SECTIONS = 6
_MIN_SUBHEADINGS = 2
_MAX_SUBHEADINGS = 5


def _classify_lines(mulu):
    """Split an outline into ``(line, level)`` pairs.

    Lines are stripped first and blank ones dropped, so whitespace-only lines
    (for example a lone ``\\r``) are never counted as headings.  A leading
    ``目录`` header line is discarded.  A line is first-level when it matches
    ``pantten_biaoti`` anywhere, otherwise second-level.
    """
    lines = [line.strip() for line in mulu.split("\n")]
    lines = [line for line in lines if line]
    if lines and lines[0] == "目录":
        lines = lines[1:]
    return [
        (line, _FIRST_LEVEL if _HEADING_RE.search(line) else _SECOND_LEVEL)
        for line in lines
    ]


def _drop_trailing_extras(entries):
    """Remove acknowledgement/reference/appendix lines among the last entries.

    Filtering is positional: only the final ``_TAIL_WINDOW`` entries are
    examined, and exactly those entries are dropped (never an earlier
    duplicate with the same text).
    """
    head, tail = entries[:-_TAIL_WINDOW], entries[-_TAIL_WINDOW:]
    kept_tail = [
        entry for entry in tail
        if not any(keyword in entry[0] for keyword in _TAIL_KEYWORDS)
    ]
    return head + kept_tail


def _group_sections(entries):
    """Group entries into ``[heading, sub1, sub2, ...]`` lists.

    Returns ``None`` when a second-level line appears before any first-level
    heading, since it has no section to belong to.
    """
    sections = []
    for line, level in entries:
        if level == _FIRST_LEVEL:
            sections.append([line])
        elif not sections:
            return None
        else:
            sections[-1].append(line)
    return sections


def _is_well_formed(sections):
    """Check section count and sub-heading counts.

    Requires at least ``_MIN_SECTIONS`` sections; every section except the
    last must have between ``_MIN_SUBHEADINGS`` and ``_MAX_SUBHEADINGS``
    sub-headings.
    """
    if len(sections) < _MIN_SECTIONS:
        return False
    return all(
        _MIN_SUBHEADINGS <= len(section) - 1 <= _MAX_SUBHEADINGS
        for section in sections[:-1]
    )


def build_training_line(text_dan):
    """Convert one raw entry into a training line, or ``None`` to skip it.

    Args:
        text_dan: One raw entry, ``<prompt>**************<outline>``.

    Returns:
        ``<prompt>**************<outline>`` with real newlines replaced by the
        two-character sequence ``\\n``, sections separated by a blank line and
        trailing acknowledgement/reference/appendix lines removed.  ``None``
        when the entry is rejected:

        * it does not contain exactly one field separator;
        * the prompt does not match ``pantten_title`` (the prompt is printed);
        * the outline has fewer than three lines;
        * the first line is not a first-level heading, or the first three
          lines are all first-level headings (no sub-headings present);
        * the grouped outline fails ``_is_well_formed``.
    """
    parts = text_dan.split(field_separator)
    if len(parts) != 2:
        return None
    title_prompt, mulu = parts

    if _TITLE_RE.search(title_prompt) is None:
        # Surface prompts that do not follow the expected template.
        print(title_prompt)
        return None

    entries = _classify_lines(mulu)
    if len(entries) < 3:
        # Too short to pass the section-count rule; reject instead of
        # indexing past the end of the list.
        return None

    leading_levels = [level for _, level in entries[:3]]
    if leading_levels[0] != _FIRST_LEVEL:
        return None
    if leading_levels == [_FIRST_LEVEL] * 3:
        return None

    sections = _group_sections(_drop_trailing_extras(entries))
    if sections is None or not _is_well_formed(sections):
        return None

    mulu_txt = "\n\n".join("\n".join(section) for section in sections)
    escaped_prompt = title_prompt.replace("\n", "\\n")
    escaped_mulu = mulu_txt.replace("\n", "\\n")
    return field_separator.join([escaped_prompt, escaped_mulu])


def main():
    """Read the raw data file, filter every entry and write the result."""
    with open(file, encoding="utf-8") as f:
        text = f.read()

    # The piece after the final record separator is discarded.
    text_list = text.split(record_separator)[:-1]

    mulu_list_new = []
    for text_dan in tqdm(text_list):
        line = build_training_line(text_dan)
        if line is not None:
            mulu_list_new.append(line)

    with open(output_file, mode="w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in mulu_list_new)


if __name__ == "__main__":
    main()