import os import json import re import math import numpy as np from tqdm import tqdm patten = "目录是“(.*)”,请把其中的" pantten_biaoti = '[1-9一二三四五六七八九ⅠⅡⅢⅣⅤⅥⅦⅧⅨ][、.]\s{0,}?[\u4e00-\u9fa5a-zA-Z]+' thanks = "致谢" references = "参考文献" excursus = "附录" def ulit_mulu(mulu): # 生成参考文件的提示文本 table_of_contents = [] if "\\n" in mulu: mulu_list = str(mulu).split("\\n") elif "\n" in mulu: mulu_list = str(mulu).split("\n") else: return False, "" if mulu_list[0] == "目录": mulu_list = mulu_list[1:] mulu_list = [i.strip() for i in mulu_list if i != ""] mulu_list_bool = [] for i in mulu_list: result_biaoti_list = re.findall(pantten_biaoti, i) if result_biaoti_list != []: mulu_list_bool.append((i, "一级标题")) else: mulu_list_bool.append((i, "二级标题")) mulu_list_bool_part = mulu_list_bool[:3] if mulu_list_bool_part[0][1] != "一级标题": return False, "" if mulu_list_bool_part[0][1] == mulu_list_bool_part[1][1] == mulu_list_bool_part[2][1] == "一级标题": return False, "" thanks_references_bool_table = mulu_list_bool[-5:] for i in thanks_references_bool_table: try: if references in i[0]: mulu_list_bool.remove(i) if thanks in i[0]: mulu_list_bool.remove(i) if excursus in i[0]: mulu_list_bool.remove(i) except: print(thanks_references_bool_table) return False, "" for i in mulu_list_bool: if i[1] == "一级标题": paper_dan = { "title": "@@" + i[0], "small_title": [], "word_count": 0 } table_of_contents.append(paper_dan) else: table_of_contents[-1]["small_title"].append(i[0]) is_contine = False for big_title in table_of_contents[:-1]: if len(big_title["small_title"]) < 2 or len(big_title["small_title"]) > 5: is_contine = True break if is_contine == True: return False, "" # print(table_of_contents) # # print(len(table_of_contents)) table_of_contents_new = [] for dabiaoti_index in range(len(table_of_contents)): dabiaoti_dict = table_of_contents[dabiaoti_index] dan_str_list = [dabiaoti_dict["title"][2:]] + dabiaoti_dict["small_title"] dan_str = "\n".join(dan_str_list) table_of_contents_new.append(dan_str) mulu_txt = "\n\n".join(table_of_contents_new) mulu_txt = mulu_txt.replace("\n", "\\n") return True, mulu_txt def is_contains_chinese(strs): for _char in strs: if '\u4e00' <= _char <= '\u9fa5': return True return False # pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+' lable_data_amount = { "title_beijing_prompt_data.txt": {"num_token": -1, "prompt": "生成论文来源的背景#"}, "title_jianjie_prompt_data.txt": {"num_token": -1, "prompt": "生成研究内容#"}, "title_yanjiubeijingyiyi_prompt_data.txt": {"num_token": -1, "prompt": "生成课题的研究背景和意义#"}, "title_zongjie_prompt_data.txt": {"num_token": -1, "prompt": "生成论文简短总结#"}, "title_zongshu_prompt_data.txt": {"num_token": -1, "prompt": "生成课题的国内外研究状况综述#"}, "jianjie_task_book_prompt_data.txt": {"num_token": -1, "prompt": "生成6点本篇论文应完成的主要内容#"}, "title_mulu_references_prompt_data.txt": {"num_token": 1, "prompt": "生成参考文献#"}, "title_mulu_small_title_prompt_shuffle_data_new.txt": {"num_token": -1, "prompt": "生成论文小标题内容#"}, "title_mulu_zhaiyao_data.txt": {"num_token": -1, "prompt": "生成论文摘要#"}, "zhaiyao_chinese_keyword_prompt_data.txt": {"num_token": -1, "prompt": "生成关键字#"}, "zhaiyao_fanyi_prompt_data.txt": {"num_token": -1, "prompt": "翻译摘要#"}, "chinese_keyword_en_prompt_data.txt": {"num_token": -1, "prompt": "翻译关键词#"}, "title_hexin_beijing_prompt_data.txt": {"num_token": -1, "prompt": "生成论文来源的背景#"}, "title_hexin_jianjie_prompt_data.txt": {"num_token": -1, "prompt": "生成研究内容#"}, "title_hexin_mulu_prompt_data.txt": {"num_token": -1, "prompt": "生成目录#"}, "title_hexin_yanjiubeijingyiyi_prompt_data.txt": {"num_token": -1, "prompt": "生成课题的研究背景和意义#"}, "title_hexin_zongjie_prompt_data.txt": {"num_token": -1, "prompt": "生成论文简短总结#"}, "title_hexin_zongshu_prompt_data.txt": {"num_token": -1, "prompt": "生成课题的国内外研究状况综述#"} } re_file = { "title_beijing_prompt_data.txt": "\n以“", "title_jianjie_prompt_data.txt": "\n请帮我生成《", "title_yanjiubeijingyiyi_prompt_data.txt": "\n请分别写出以《", "title_zongjie_prompt_data.txt": "\n以“", "title_zongshu_prompt_data.txt": "\n请写出以《", "jianjie_task_book_prompt_data.txt": "\n\"请根据题目为《", "title_mulu_references_prompt_data.txt": "\n\"论文题目是“", "zhaiyao_chinese_keyword_prompt_data.txt": "\n\"请为“", "zhaiyao_fanyi_prompt_data.txt": "\n\"请把“", "chinese_keyword_en_prompt_data.txt": "\n\"请把“", "title_mulu_zhaiyao_data.txt": "@@@@@@@@@@@@@@@@@@", "title_mulu_small_title_prompt_shuffle_data_new.txt": "@@@@@@@@@@@@@@@@@@", "title_hexin_beijing_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@", "title_hexin_jianjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@", "title_hexin_mulu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@", "title_hexin_yanjiubeijingyiyi_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@", "title_hexin_zongjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@", "title_hexin_zongshu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@" } split_teshu = [ "title_mulu_zhaiyao_data.txt", "title_mulu_small_title_prompt_shuffle_data_new.txt", "title_hexin_beijing_prompt_data.txt", "title_hexin_jianjie_prompt_data.txt", "title_hexin_mulu_prompt_data.txt", "title_hexin_yanjiubeijingyiyi_prompt_data.txt", "title_hexin_zongjie_prompt_data.txt", "title_hexin_zongshu_prompt_data.txt" ] patten_mulu = { "title_mulu_references_prompt_data.txt": "目录是“(.*)”,请为这篇论文生成15篇左右的参考文献", "title_mulu_small_title_prompt_shuffle_data_new.txt": "目录是“(.*)”,请把其中的小标题", "title_mulu_zhaiyao_data.txt": "目录是“(.*)”,生成论文摘要" } shuminghao = { "title_beijing_prompt_data.txt": [("以“","以《"),("”为论文题目","》为论文题目")], "title_hexin_beijing_prompt_data.txt": [("以“","以《"),("”为论文题目","》为论文题目")], "title_hexin_mulu_prompt_data.txt": [("论文题目为“","论文题目为《"),("”,以“","》,以“")], # 论文题目为“关于《金子美玲童谣全集》中起点文本向目标文本转换的研究”,以“ "title_hexin_zongjie_prompt_data.txt":[("以“","以《"),("”为论文题目","》为论文题目")], # 以“面向海量微服务的高可用服务注册中心的研究与实现”为论文题目 "title_mulu_small_title_prompt_shuffle_data_new.txt": [("论文题目是“", "论文题目是《"), ("”,目录是", "》,目录是")], # "论文题目是“八十年代审美非功利思潮研究”,目录是 "title_mulu_zhaiyao_data.txt": [("论文题目是“", "论文题目是《"), ("”,目录是", "》,目录是")], # "论文题目是“网络媒体报道对房地产市场的影响研究”,目录是 "title_zongjie_prompt_data.txt": [("以“", "以《"), ("”为论文题目", "》为论文题目")] # 以“网中人:论哈金《等待》中的伦理困境”为论文题目 } path_list = [] file = "./data/paper_prompt_title_3" for root, dirs, files in os.walk(file): for file in files: path = os.path.join(root, file) path_list.append(path) file = "./data/paper_prompt_title_3_1" for root, dirs, files in os.walk(file): for file in files: path = os.path.join(root, file) path_list.append(path) file = "./data/paper_prompt_title_3_1_1" for root, dirs, files in os.walk(file): for file in files: path = os.path.join(root, file) path_list.append(path) file = "./data/paper_prompt_title_hexin_3" for root, dirs, files in os.walk(file): for file in files: path = os.path.join(root, file) path_list.append(path) text_list_new = [] tongji = {} for path in path_list: patten_mulu_bool = False shuminghao_bool = False new_data_list = [] patten_mulu_patten = "" shuminghao_list = "" task_name = path.split("\\")[-1] if task_name in re_file: spilt_dan = re_file[task_name] else: continue if task_name in patten_mulu: patten_mulu_bool = True patten_mulu_patten = patten_mulu[task_name] if task_name in shuminghao: shuminghao_bool = True shuminghao_list = shuminghao[task_name] train_data_amount_dict = lable_data_amount[task_name] train_data_amount = train_data_amount_dict["num_token"] with open(path, encoding="utf-8") as f: text = f.read() text_list = text.split(spilt_dan) index = 1 if train_data_amount == -1: train_data_amount = len(text_list) -1 while True: if index >= train_data_amount: break data_dan = text_list[index] if "**************" in data_dan: if task_name in split_teshu: data_dan = data_dan else: data_dan = spilt_dan[1:] + data_dan # text_list_new.append(data_dan) if patten_mulu_bool == True: content, summary = data_dan.split("**************") result_biaoti_list = re.findall(patten_mulu_patten, content) try: mulu = str(result_biaoti_list[0]) except: index += 1 continue bool_, mulu_new = ulit_mulu(mulu) if bool_ == True: content = content.replace(mulu,mulu_new) data_dan = "**************".join([content, summary]) data_dan = data_dan.replace("\\n", "\n").replace("\n", "\\n") else: index += 1 continue else: data_dan = data_dan.replace("\\n", "\n").replace("\n", "\\n") if shuminghao_bool == True: content, summary = data_dan.split("**************") for rep in shuminghao_list: content = content.replace(rep[0], rep[1]) data_dan = "**************".join([content, summary]) new_data_list.append(data_dan) index += 1 if task_name not in tongji: tongji[task_name] = 1 else: tongji[task_name] += 1 else: index += 4 print(data_dan) print(task_name, "\n") if new_data_list!= []: print(new_data_list[0]) with open(f"./data/训练数据集合/{task_name}", mode="w", encoding="utf-8") as f: for i in new_data_list: f.write(i) f.write("\n") # train_list.append({"content": str(title_p), "summary": str(b)}) train_list = [] for text, prompt in text_list_new: content, summary = text.split("**************") train_list.append( {"query": str(content).strip("\"").strip("\n").strip("\""), "response": str(summary), "prompt": prompt} ) import random random.shuffle(train_list) for i in tongji: print(i, tongji[i]) with open("./data/chatglm_paper_data_2_prompt.txt", mode="w", encoding="utf-8") as f: for i in train_list: f.write(json.dumps(i, ensure_ascii=False)) f.write("\n")