"""Build a ChatGLM fine-tuning dataset from raw paper-prompt dump files.

Each dump file concatenates samples.  A sample starts with a task-specific
split token (see ``re_file``) and holds a prompt and a completion separated
by a run of asterisks.  The script takes up to ``num_token`` samples per
file, shuffles them, and writes one JSON object per line to
``./data/chatglm_paper_data_2.txt``.
"""
import json
import os
import random


def is_contains_chinese(strs):
    """Return True if the string contains at least one CJK character."""
    for _char in strs:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False


# Separator between the prompt ("content") and the completion ("summary")
# inside a single sample.
SAMPLE_SEP = "**************"

# Per-file sample cap ("num_token") and the instruction prefix of each task.
# Note: a num_token of 1 effectively disables a file, since extraction
# starts at chunk index 1.
label_data_amount = {
    "title_beijing_prompt_data.txt": {"num_token": 5000, "prompt": "生成论文来源的背景#"},
    "title_jianjie_prompt_data.txt": {"num_token": 5000, "prompt": "生成研究内容#"},
    "title_mulu_prompt_data.txt": {"num_token": 5000, "prompt": "生成目录#"},
    "title_yanjiubeijingyiyi_prompt_data.txt": {"num_token": 5000, "prompt": "生成课题的研究背景和意义#"},
    "title_zongjie_prompt_data.txt": {"num_token": 5000, "prompt": "生成论文简短总结#"},
    "title_zongshu_prompt_data.txt": {"num_token": 5000, "prompt": "生成课题的国内外研究状况综述#"},
    "jianjie_task_book_prompt_data.txt": {"num_token": 5000, "prompt": "生成6点本篇论文应完成的主要内容#"},
    "title_mulu_references_prompt_data.txt": {"num_token": 1, "prompt": "生成参考文献#"},
    "title_mulu_small_title_prompt_shuffle_data.txt": {"num_token": 18730, "prompt": "生成论文小标题内容#"},
    "title_mulu_zhaiyao_data.txt": {"num_token": 5000, "prompt": "生成论文摘要#"},
    "zhaiyao_chinese_keyword_prompt_data.txt": {"num_token": 5000, "prompt": "生成关键字#"},
    "zhaiyao_fanyi_prompt_data.txt": {"num_token": 5000, "prompt": "翻译摘要#"},
    "chinese_keyword_en_prompt_data.txt": {"num_token": 5000, "prompt": "翻译关键词#"},
    "title_hexin_beijing_prompt_data.txt": {"num_token": 4971, "prompt": "生成论文来源的背景#"},
    "title_hexin_jianjie_prompt_data.txt": {"num_token": 4903, "prompt": "生成研究内容#"},
    "title_hexin_mulu_prompt_data.txt": {"num_token": 4954, "prompt": "生成目录#"},
    "title_hexin_yanjiubeijingyiyi_prompt_data.txt": {"num_token": 4902, "prompt": "生成课题的研究背景和意义#"},
    "title_hexin_zongjie_prompt_data.txt": {"num_token": 4971, "prompt": "生成论文简短总结#"},
    "title_hexin_zongshu_prompt_data.txt": {"num_token": 4671, "prompt": "生成课题的国内外研究状况综述#"}
}

# Token that marks the start of every sample in the corresponding file.
re_file = {
    "title_beijing_prompt_data.txt": "\n以“",
    "title_jianjie_prompt_data.txt": "\n请帮我生成《",
    "title_mulu_prompt_data.txt": "\n为论文题目“",
    "title_yanjiubeijingyiyi_prompt_data.txt": "\n请分别写出以《",
    "title_zongjie_prompt_data.txt": "\n以“",
    "title_zongshu_prompt_data.txt": "\n请写出以《",
    "jianjie_task_book_prompt_data.txt": "\n\"请根据题目为《",
    "title_mulu_references_prompt_data.txt": "\n\"论文题目是“",
    "zhaiyao_chinese_keyword_prompt_data.txt": "\n\"请为“",
    "zhaiyao_fanyi_prompt_data.txt": "\n\"请把“",
    "chinese_keyword_en_prompt_data.txt": "\n\"请把“",
    "title_mulu_zhaiyao_data.txt": "@@@@@@@@@@@@@@@@@@",
    "title_mulu_small_title_prompt_shuffle_data.txt": "@@@@@@@@@@@@@@@@@@",
    "title_hexin_beijing_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_jianjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_mulu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_yanjiubeijingyiyi_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_zongjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_zongshu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@"
}

# Files whose split token is a pure "@" sentinel ("teshu" = special): their
# samples are kept verbatim, nothing needs to be re-attached after splitting.
split_teshu = [
    "title_mulu_zhaiyao_data.txt",
    "title_mulu_small_title_prompt_shuffle_data.txt",
    "title_hexin_beijing_prompt_data.txt",
    "title_hexin_jianjie_prompt_data.txt",
    "title_hexin_mulu_prompt_data.txt",
    "title_hexin_yanjiubeijingyiyi_prompt_data.txt",
    "title_hexin_zongjie_prompt_data.txt",
    "title_hexin_zongshu_prompt_data.txt"
]

# Collect every file under the raw dump directories.
data_dirs = [
    "./data/paper_prompt_title_3",
    "./data/paper_prompt_title_3_1",
    "./data/paper_prompt_title_3_1_1",
    "./data/paper_prompt_title_hexin_3",
]
path_list = []
for data_dir in data_dirs:
    for root, dirs, files in os.walk(data_dir):
        for file_name in files:
            path_list.append(os.path.join(root, file_name))

text_list_new = []
tongji = {}  # per-task sample counts ("tongji" = tally)
for path in path_list:
    # os.path.basename is portable; the original path.split("\\") only
    # worked with Windows path separators.
    task_name = os.path.basename(path)
    if task_name not in re_file:
        continue
    split_dan = re_file[task_name]
    # The original assigned the whole config dict here and then compared an
    # int against it; "num_token" is the intended per-file cap.
    train_data_amount = label_data_amount[task_name]["num_token"]
    with open(path, encoding="utf-8") as f:
        text = f.read()
    text_list = text.split(split_dan)

    index = 1  # chunk 0 is whatever precedes the first split token
    # Also bound by len(text_list): the original indexed unconditionally and
    # could crash with IndexError on short files.
    while index < train_data_amount and index < len(text_list):
        data_dan = text_list[index]
        if SAMPLE_SEP in data_dan:
            # Optional filter (left disabled, as in the original): for
            # title_jianjie_prompt_data.txt, drop samples whose summary
            # contains no Chinese characters.
            # content, summary = data_dan.split(SAMPLE_SEP)
            # if not is_contains_chinese(summary):
            #     index += 1
            #     continue
            if task_name not in split_teshu:
                # str.split consumed the split token, so glue it back on
                # (minus its leading "\n") to keep the prompt text intact.
                data_dan = split_dan[1:] + data_dan
            text_list_new.append(data_dan)
            index += 1
            tongji[task_name] = tongji.get(task_name, 0) + 1
        else:
            # Malformed sample without a separator: log it and skip ahead.
            index += 4
            print(data_dan)

train_list = []
for text in text_list_new:
    # maxsplit=1 keeps any later asterisk runs inside the summary instead of
    # raising ValueError on unpacking.
    content, summary = text.split(SAMPLE_SEP, 1)
    train_list.append(
        {"content": content.strip("\"\n"), "summary": summary}
    )

random.shuffle(train_list)

for task_name, count in tongji.items():
    print(task_name, count)

# One JSON object per line (JSONL); keep Chinese characters unescaped.
with open("./data/chatglm_paper_data_2.txt", mode="w", encoding="utf-8") as f:
    for sample in train_list:
        f.write(json.dumps(sample, ensure_ascii=False))
        f.write("\n")
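
# ---------------------------------------------------------------------------
# Optional sanity check -- a minimal sketch, not part of the pipeline above.
# It re-reads the JSONL file written by this script and verifies that every
# line parses and carries non-empty "content"/"summary" fields.  The default
# path matches the output file above; the function name and the check logic
# are illustrative assumptions, not an established API.
def sanity_check_jsonl(path="./data/chatglm_paper_data_2.txt"):
    n_ok = 0
    with open(path, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            record = json.loads(line)  # raises ValueError on a broken line
            assert record.get("content") and record.get("summary"), \
                "empty field at line %d" % line_no
            n_ok += 1
    print("%d samples passed the sanity check" % n_ok)


sanity_check_jsonl()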