"""Filter ChatGPT-generated thesis outlines into a training data file.

The input file holds records separated by a run of twenty ``@`` characters.
Each record has the form ``<prompt>**************<outline>``. A record is
kept only when:

* the prompt contains a ``《title》`` matching ``pantten_title``;
* the outline starts with a top-level heading and its first three entries
  are not all top-level headings;
* after dropping reference / acknowledgement / appendix entries from the
  last five lines, it has at least six top-level headings; and
* every top-level heading except the last has between two and five
  sub-headings.

Kept records are written one per line, with real newlines replaced by the
two-character sequence ``\\n``.
"""

import os
import re
from typing import List, Optional, Tuple

try:
    from tqdm import tqdm
except ImportError:  # The progress bar is cosmetic; run without it.
    def tqdm(iterable, *args, **kwargs):
        """Return *iterable* unchanged when tqdm is not installed."""
        return iterable

# Data generated by ChatGPT.
file = "./data/paper_prompt_title_3/title_mulu_prompt_data.txt"
output_file = "./data/训练数据集合/generate_mulu.txt"

RECORD_SEPARATOR = "@" * 20
FIELD_SEPARATOR = "**************"

pantten_title = "《(.*?)》生成目录,要求只有一级标题和二级标题,"
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
pantten_biaoti = r'[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+'
thanks = "致谢"
references = "参考文献"
excursus = "附录"

_TITLE_RE = re.compile(pantten_title)
_HEADING_RE = re.compile(pantten_biaoti)

# How many trailing outline entries are scanned for back matter.
_BACK_MATTER_TAIL = 5
_MIN_SECTIONS = 6
_MIN_SUBHEADINGS = 2
_MAX_SUBHEADINGS = 5


def _classify_lines(mulu: str) -> List[Tuple[str, bool]]:
    """Split an outline into ``(text, is_top_level)`` entries.

    Lines are stripped first and then filtered, so whitespace-only lines are
    discarded instead of surviving as empty sub-headings. A leading "目录"
    line is dropped.
    """
    lines = [line.strip() for line in mulu.strip("\n").split("\n")]
    lines = [line for line in lines if line]
    if lines and lines[0] == "目录":
        lines = lines[1:]
    return [(line, _HEADING_RE.search(line) is not None) for line in lines]


def _drop_back_matter(entries: List[Tuple[str, bool]],
                      tail: int = _BACK_MATTER_TAIL) -> List[Tuple[str, bool]]:
    """Remove reference/acknowledgement/appendix entries from the last *tail* entries.

    Entries are removed by position, so an identical line earlier in the
    outline is left alone and an entry matching several keywords is removed
    exactly once.
    """
    keywords = (references, thanks, excursus)
    cut = max(len(entries) - tail, 0)
    kept_tail = [
        entry for entry in entries[cut:]
        if not any(keyword in entry[0] for keyword in keywords)
    ]
    return entries[:cut] + kept_tail


def _group_sections(entries: List[Tuple[str, bool]]) -> Optional[List[Tuple[str, List[str]]]]:
    """Group entries into ``(heading, [sub-headings])`` pairs.

    Returns ``None`` when a sub-heading appears before any top-level heading
    (possible once the leading heading has been removed as back matter).
    """
    sections: List[Tuple[str, List[str]]] = []
    for text, is_top_level in entries:
        if is_top_level:
            sections.append((text, []))
        elif not sections:
            return None
        else:
            sections[-1][1].append(text)
    return sections


def filter_record(text_dan: str) -> Optional[str]:
    """Validate one raw record and return its output line.

    Args:
        text_dan: One record, ``<prompt>**************<outline>``.

    Returns:
        ``<prompt>**************<outline>`` with newlines escaped as ``\\n``
        and outline sections separated by a blank line, or ``None`` when the
        record is rejected.
    """
    parts = text_dan.split(FIELD_SEPARATOR)
    if len(parts) != 2:
        return None
    title_prompt, mulu = parts

    if _TITLE_RE.search(title_prompt) is None:
        # Surface prompts that do not carry the expected title wording.
        print(title_prompt)
        return None

    entries = _classify_lines(mulu)
    # Fewer than three entries can never reach the six-section minimum;
    # reject early instead of indexing past the end of the list.
    if len(entries) < 3:
        return None
    if not entries[0][1]:
        return None
    # The first entry is top-level; reject if the next two are as well.
    if entries[1][1] and entries[2][1]:
        return None

    sections = _group_sections(_drop_back_matter(entries))
    if sections is None or len(sections) < _MIN_SECTIONS:
        return None
    # The last section (typically the conclusion) is exempt from the
    # sub-heading count requirement.
    for _, sub_titles in sections[:-1]:
        if not _MIN_SUBHEADINGS <= len(sub_titles) <= _MAX_SUBHEADINGS:
            return None

    mulu_txt = "\n\n".join(
        "\n".join([heading] + sub_titles) for heading, sub_titles in sections
    )
    return FIELD_SEPARATOR.join([
        title_prompt.replace("\n", "\\n"),
        mulu_txt.replace("\n", "\\n"),
    ])


def main() -> None:
    """Read the raw data file, filter every record and write the survivors."""
    with open(file, encoding="utf-8") as f:
        text = f.read()

    # Whatever follows the final separator is discarded.
    text_list = text.split(RECORD_SEPARATOR)[:-1]

    mulu_list_new = []
    for text_dan in tqdm(text_list):
        line = filter_record(text_dan)
        if line is not None:
            mulu_list_new.append(line)

    with open(output_file, mode="w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in mulu_list_new)


if __name__ == "__main__":
    main()