"""Build translation and keyword-extraction prompts from generated abstracts.

The input file holds records separated by a run of 20 ``@`` characters.
Each record is expected to contain exactly one run of 20 ``*`` characters
that separates the title prompt from the abstract ("zhaiyao").  For every
well-formed record two prompts are produced from the abstract:

* a "translate this text into English" prompt, and
* a "generate 3-5 keywords for this abstract" prompt.

Each prompt list is shuffled and written to its own output file, one
JSON-encoded string per line.
"""
import json
import math
import random
import re

import numpy as np
from tqdm import tqdm

# Prompt template: "Please translate the text “{}” into English".
prompt = "请把“{}”这段文字翻译成英文"
# Prompt template: "Please generate 3-5 keywords for the paper abstract “{}”,
# numbered with Arabic numerals, e.g. 1.xxx 2.xxx ...".
chinese_keyword_prompt = "请为“{}”这段论文摘要生成3-5个关键字,使用阿拉伯数字作为序号标注,例如“1.xxx \n2.xxx \n3.xxx \n4.xxx \n5.xxx \n”"
# Regex fragment that captures the title between 《 and 》 in a title prompt.
# NOTE(review): not referenced anywhere in the visible part of this script.
pantten_title = "《(.*?)》为题目生成论文摘要,要求生成的字数在"

# Alternative input locations that were used previously (kept disabled):
# path = "./data/paper_prompt_title_3/title_zhaiyao_prompt_data.txt"
# with open(path, encoding="utf-8") as f:
#     text = f.read()

path_chatgpt_output = "chatgpt_data_v1"
path = "data/{}/paper_prompt_title_1_1/sencend_zhaiyao_prompt_data.txt".format(path_chatgpt_output)
# path = r"E:\pycharm_workspace\mulu_ner\data\chatgpt_data_v1\paper_prompt_title_1_1\title_zhaiyao_prompt_data.txt"

with open(path, encoding="utf-8") as f:
    text = f.read()

# Records are delimited by 20 consecutive "@" characters.
text_list = text.split("@" * 20)

data_list = []
chinese_keyword_data_list = []

for text_dan in tqdm(text_list):
    try:
        # A well-formed record has exactly one "*" * 20 separator, giving
        # (title prompt, abstract).  Any other part count makes the
        # two-way unpack raise ValueError.
        title_prompt, zhaiyao = text_dan.split("*" * 20)
    except ValueError:
        # Malformed record: skip it.  Only ValueError is caught so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        continue

    # Drop the newlines surrounding the abstract; other whitespace is kept.
    zhaiyao = zhaiyao.strip("\n")
    data_list.append(prompt.format(zhaiyao))
    chinese_keyword_data_list.append(chinese_keyword_prompt.format(zhaiyao))

# Shuffle and dump the translation prompts, one JSON string per line.
random.shuffle(data_list)
with open("./data/zhaiyao_to_/zhaiyao_fanyi_prompt.txt", mode="w", encoding="utf-8") as f:
    for i in data_list:
        f.write(json.dumps(i, ensure_ascii=False) + "\n")

# Shuffle and dump the keyword prompts, one JSON string per line.
random.shuffle(chinese_keyword_data_list)
with open("./data/zhaiyao_to_/zhaiyao_chinese_keyword_prompt.txt", mode="w", encoding="utf-8") as f:
    for i in chinese_keyword_data_list:
        f.write(json.dumps(i, ensure_ascii=False) + "\n")

# NOTE: disabled legacy code kept for reference.  It refers to names
# (table_of_contents, paper_text, nerlable_list, ner_lable, text_zong) that
# are not defined anywhere in the visible part of this script.
#
# for lable in table_of_contents:
#     text_len = len(paper_text)
#     dan_nerlable = [text_len, text_len + len(lable[0]), lable[1]]
#     nerlable_list.append(dan_nerlable)
#     paper_text += lable[0]
#     paper_text += "@"
#
# paper_dan = {"text": paper_text, "label": nerlable_list}
#
# ner_lable.append(str(table_of_contents))
# text_zong.append(paper_dan)
#
# with open("../data/train.txt", mode="w", encoding="utf-8") as f:
#     for i in text_zong:
#         f.write(json.dumps(i, ensure_ascii=False))
#         f.write("\n")
#
#
# with open("../data/train_lable.txt", mode="w") as f:
#     for i in ner_lable:
#         f.write(json.dumps(i, ensure_ascii=False))
#         f.write("\n")