You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
611 lines
26 KiB
611 lines
26 KiB
![]()
2 years ago
|
import json
|
||
|
import re
|
||
|
from tqdm import tqdm
|
||
|
|
||
|
import os
|
||
|
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
|
||
|
import torch
|
||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||
|
from transformers.generation.utils import GenerationConfig
|
||
|
|
||
|
# Local directory holding the Baichuan-13B-Chat checkpoint.
model_path = "/home/majiahui/project/models-llm/Baichuan-13B-Chat"

# trust_remote_code=True is passed to both loaders so that the Python code
# bundled with the checkpoint is used to build the tokenizer and the model.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=False,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# Use the generation settings stored next to the checkpoint.
model.generation_config = GenerationConfig.from_pretrained(model_path)
|
||
|
|
||
|
|
||
|
def predict_baichuan(text):
    """Send ``text`` to the chat model with a translate-to-English request.

    The text is prefixed with a fixed Chinese instruction ("translate the
    following passage into English") and submitted as a single user turn.

    Args:
        text: The passage to translate.

    Returns:
        Whatever ``model.chat`` returns for that one-message conversation.
    """
    prompt = "把下面这段翻译成英文\n" + text
    conversation = [{"role": "user", "content": prompt}]
    return model.chat(tokenizer, conversation)
|
||
|
|
||
|
# Conversion table, keyed by the Chinese ``instruction`` string of a record.
#
# Every entry contains:
#   * "data_tpye" -- the English instruction label written to the output
#     record.  Labels are unique per task so that the generated data never
#     mixes two different tasks under one instruction.
#   * "en-prompt" -- the English prompt template.  Tokens such as
#     ``{$title}``, ``{$wordnum}``, ``{$catalogue_str}``, ``{$secondray_title}``,
#     ``{$abstract}``, ``{$opening_report_main_content}`` and
#     ``{$this->core_content}`` are substituted by the conversion loop.
#   * one or more regex patterns ("title", "catalogue_str", "smell-title",
#     "abstract", "hexin", "opening_report_main_content"), each with a single
#     capture group that extracts a field from the Chinese input text.
prompt_tpye_dict = {
    "生成论文来源的背景#\n": {
        "data_tpye": "Background for generating paper sources#\n",
        "title": "以《(.*)》为论文题目",
        "en-prompt": "Using ‘{$title}’ as the title of the paper, write a paragraph about the background of the English question source, with a minimum word count of {$wordnum} words"
    },
    "生成研究内容#\n": {
        "data_tpye": "Generate research content#\n",
        "title": "请帮我生成《(.*)》为题目",
        "en-prompt": "Please help me generate the English research content titled ‘{$title}’, including an overall introduction and a summary of at least three aspects"
    },
    "生成目录#\n": {
        "data_tpye": "Generate directory#\n",
        "title": "为论文题目《(.*)》",
        "en-prompt": "Generate an English directory for the paper title ‘{$title}’, which requires only the first level title and second level title. The format of the first level title is as follows: 1. xxx; The format of the secondary title is as follows: 1.1 xxx; 4 first level titles; Each first level title should contain at least 3 second level titles;"
    },
    "生成课题的研究背景和意义#\n": {
        # Mirrors the label of the "- core content" variant of this task; the
        # label must not collide with the one used for "生成目录#\n".
        "data_tpye": "Research background and significance of the generated topic#\n",
        "title": "请分别写出以《(.*)》为课题",
        "en-prompt": "Please provide the English research background and significance of ‘{$title}’ as the topic, with a minimum of {$wordnum} words"
    },
    "生成致谢#\n": {
        "data_tpye": "Generate acknowledgments#\n",
        "title": "请以《(.*)》为题写一篇论文的中文致谢",
        "en-prompt": "Write an English thank based on the paper title ‘{$title}’"
    },
    "生成论文简短总结#\n": {
        "data_tpye": "Generate a brief summary of the paper#\n",
        "title": "以《(.*)》为论文题目",
        "en-prompt": "Write a brief summary of the English paper titled ‘{$title}’, with a requirement of no more than {$wordnum} words"
    },
    "生成课题的国内外研究状况综述#\n": {
        # Mirrors the label of the "- core content" variant of this task; the
        # label must not collide with the one used for "生成论文简短总结#\n".
        "data_tpye": "Overview of domestic and international research status on generating topics#\n",
        "title": "请写出以《(.*)》为课题的国内外研究状况综述",
        "en-prompt": "Please provide an English summary of the research status of ‘{$title}’ at home and abroad, with a word count of around {$wordnum} words"
    },
    "生成6点本篇论文应完成的主要内容#\n": {
        "data_tpye": "Generate 6 main contents that should be completed in this paper#\n",
        "title": "请根据题目为《(.*)》",
        "opening_report_main_content": "研究内容为“(.*)”总结出至少6点本篇论文应完成的主要内容",
        "en-prompt": "Please summarize at least 6 main English content that should be completed for this paper based on the title ‘{$title}’ and the research content ‘{$opening_report_main_content}’, using Arabic numerals for arrangement"
    },
    "生成参考文献#\n": {
        "data_tpye": "Generate references#\n",
        "title": "论文题目是《(.*)》",
        "catalogue_str": "目录是“(.*)”",
        "en-prompt": "According to the paper title ‘{$title}’ and the directory is ‘{$catalogue_str}’, generate 15 references in the format of [1] xxx."
    },
    "生成论文小标题内容#\n": {
        "data_tpye": "Generate paper subheading content#\n",
        "title": "论文题目是《(.*)》",
        "catalogue_str": "目录是“(.*)”",
        "smell-title": "请把其中的小标题“(.*?)”的内容补充完整",
        "en-prompt": "According to the paper title ‘{$title}’ and the table of contents ‘{$catalogue_str}’, add approximately {$wordnum} words of English content to the subheading ‘{$secondray_title}’. The content must include the current subheading and not include other titles in the table of contents"
    },
    "生成论文摘要#\n": {
        "data_tpye": "Generate paper abstract#\n",
        "title": "论文题目是《(.*)》",
        "catalogue_str": "目录是“(.*)”",
        "en-prompt": "Generate a Chinese paper abstract based on the title “{$title}” and the directory “{$catalogue_str}”, with a required word count of around {$wordnum} words"
    },
    "生成关键字#\n": {
        "data_tpye": "Generate Keywords#\n",
        "abstract": "请为“(.*)”这段论文摘要生成3-5个关键字",
        "en-prompt": "Please generate 3-5 keywords for the abstract of the paper ‘{$abstract}’"
    },
    "生成论文来源的背景-核心内容#\n": {
        "data_tpye": "Background of generating paper sources - core content#\n",
        "title": "以《(.*)》为论文题目",
        "hexin": "以“(.*)”为论文的研究方向",
        "en-prompt": "Based on the paper title ‘{$title}’ and the core content ‘{$this->core_content}’, generate a background of approximately {$wordnum} words of English topic sources"
    },
    "生成研究内容-核心内容#\n": {
        "data_tpye": "Generate research content - core content#\n",
        "title": "请帮我生成《(.*)》为题目",
        "hexin": "以“(.*)”为论文的研究方向",
        "en-prompt": "Generate English research content based on the paper title ‘{$title}’ and core content ‘{$this->core_content}’, including an overall introduction and a summary of at least three aspects"
    },
    "生成目录-核心内容#\n": {
        "data_tpye": "Generate Directory - Core Content#\n",
        "title": "为论文题目《(.*)》生成目录",
        "hexin": "以“(.*)”为论文的研究方向",
        "en-prompt": "Generate an English directory based on the paper title ‘{$title}’ and the core content ‘{$this->core_content}’, with only the first and second level titles required. The format of the first level title is as follows: 1. xxx; The format of the secondary title is as follows: 1.1 xxx; 4 first level titles; Each first level title should contain at least 3 second level titles;"
    },
    "生成课题的研究背景和意义-核心内容#\n": {
        "data_tpye": "Research background and significance of the generated topic - core content#\n",
        "title": "请分别写出以《(.*)》为课题",
        "hexin": "以“(.*)”为论文的研究方向",
        "en-prompt": "Generate an English research background and significance of no less than {$wordnum} words based on the paper title ‘{$title}’ and the core content ‘{$this->core_content}’"
    },
    "生成论文简短总结-核心内容#\n": {
        "data_tpye": "Generate a brief summary of the paper - core content#\n",
        "title": "以《(.*)》为论文题目",
        "hexin": "以“(.*)”为论文的研究方向",
        "en-prompt": "Generate a brief summary of the English paper with approximately {$wordnum} words based on the title ‘{$title}’ and the core content ‘{$this->core_content}’"
    },
    "生成课题的国内外研究状况综述-核心内容#\n": {
        "data_tpye": "Overview of domestic and international research status on generating topics - core content#\n",
        "title": "请写出以《(.*)》为课题",
        "hexin": "以“(.*)”为论文的研究方向",
        "en-prompt": "Based on the paper title ‘{$title}’ and the core content ‘{$this->core_content}’, generate an English summary of research status at home and abroad with no less than {$wordnum} words"
    }
}
|
||
|
|
||
|
# (pattern key in the prompt_tpye_dict entry, placeholder in its "en-prompt")
_TITLE = ("title", "{$title}")
_CATALOGUE = ("catalogue_str", "{$catalogue_str}")
_CORE = ("hexin", "{$this->core_content}")

# Conversion recipe per instruction: ``(fields, word_count)``.
#   * ``fields``     -- ordered (pattern key, placeholder) pairs to extract from
#     the input, translate and substitute into the English template.
#   * ``word_count`` -- text substituted for ``{$wordnum}``, or ``None`` when
#     the template has no word-count token.
# Each task lists only the pattern keys that its prompt_tpye_dict entry
# actually defines (the keyword task has "abstract" only; asking it for
# "catalogue_str" raises KeyError).
_TASK_SPECS = {
    "生成论文来源的背景#\n": ((_TITLE,), "200"),
    "生成研究内容#\n": ((_TITLE,), None),
    "生成目录#\n": ((_TITLE,), None),
    "生成课题的研究背景和意义#\n": ((_TITLE,), "1000"),
    "生成致谢#\n": ((_TITLE,), None),
    "生成论文简短总结#\n": ((_TITLE,), "300"),
    "生成课题的国内外研究状况综述#\n": ((_TITLE,), "800"),
    "生成6点本篇论文应完成的主要内容#\n": (
        (_TITLE, ("opening_report_main_content", "{$opening_report_main_content}")),
        None,
    ),
    "生成参考文献#\n": ((_TITLE, _CATALOGUE), None),
    "生成论文小标题内容#\n": (
        (_TITLE, _CATALOGUE, ("smell-title", "{$secondray_title}")),
        "800",
    ),
    "生成论文摘要#\n": ((_TITLE, _CATALOGUE), "400"),
    "生成关键字#\n": ((("abstract", "{$abstract}"),), None),
    "生成论文来源的背景-核心内容#\n": ((_TITLE, _CORE), "200"),
    "生成研究内容-核心内容#\n": ((_TITLE, _CORE), None),
    "生成目录-核心内容#\n": ((_TITLE, _CORE), None),
    "生成课题的研究背景和意义-核心内容#\n": ((_TITLE, _CORE), "1000"),
    "生成论文简短总结-核心内容#\n": ((_TITLE, _CORE), "300"),
    "生成课题的国内外研究状况综述-核心内容#\n": ((_TITLE, _CORE), "800"),
}


def _build_english_record(instruction, escaped_input):
    """Build the English ``{"instruction", "input"}`` record for one sample.

    Args:
        instruction: A key of ``_TASK_SPECS`` / ``prompt_tpye_dict``.
        escaped_input: The sample's input text with every newline replaced by
            the two characters ``\\n`` so that ``.`` in the patterns can span
            what used to be several lines.

    Returns:
        The new record, or ``None`` when the subheading pattern
        ("smell-title") finds nothing, in which case the sample is skipped.

    Raises:
        ValueError: If any other required pattern does not match.
    """
    fields, word_count = _TASK_SPECS[instruction]
    entry = prompt_tpye_dict[instruction]

    # Extract every field before calling the model, so a record that cannot
    # be parsed costs no translation calls.
    extracted = []
    for pattern_key, placeholder in fields:
        matches = re.findall(entry[pattern_key], escaped_input)
        if not matches:
            if pattern_key == "smell-title":
                return None
            raise ValueError(f"pattern {pattern_key!r} did not match the input")
        value = matches[0]
        if pattern_key != "title":
            # Restore real newlines; the title is passed on as captured.
            value = value.replace("\\n", "\n")
        extracted.append((placeholder, value))

    prompt = entry["en-prompt"]
    for placeholder, value in extracted:
        prompt = prompt.replace(placeholder, predict_baichuan(value))
    if word_count is not None:
        prompt = prompt.replace("{$wordnum}", word_count)

    return {"instruction": entry["data_tpye"], "input": prompt}


# The data contains Chinese text, so read it with the same explicit encoding
# that is used for the output file instead of the platform default.
with open("data/llama_t/chatglm_dev_4_prompt_llama.json", encoding="utf-8") as f:
    data = json.load(f)

data_new = []
# Only the first 10 records are processed.
for record in tqdm(data[:10]):
    instruction = record["instruction"]
    input_ = str(record["input"]).replace("\n", "\\n")

    # Records with an instruction that has no recipe are left out of the output.
    if instruction not in _TASK_SPECS:
        continue

    try:
        converted = _build_english_record(instruction, input_)
    except Exception as exc:
        # Best effort: report the offending record and keep going.  Unlike a
        # bare ``except:``, this lets Ctrl-C / SystemExit stop the run.
        print(f"failed to convert record ({exc!r}):")
        print(record)
        continue

    if converted is not None:
        data_new.append(converted)

with open("./data/llama_t/chatglm_en.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(data_new, ensure_ascii=False, indent=2))
|
||
|
|