数据处理代码,为了生成chatgpt数据
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

42 lines
981 B

2 years ago
import json
import re
import math
import numpy as np
from tqdm import tqdm
import random

# Template for the generated request; "{}" receives the Chinese keyword string.
prompt = "请把“{}”这几个关键字翻译成英文"
# NOTE(review): not referenced anywhere in the visible script; kept unchanged
# in case code outside this view relies on it.
pantten_title = "(.*?)》为题目生成论文摘要,要求生成的字数在"
# Input file: concatenated records, each containing a prompt and its keywords.
path = "./data/paper_prompt_title_3_1/zhaiyao_chinese_keyword_prompt_data.txt"
# Output file: one JSON-encoded prompt string per line.
OUTPUT_PATH = "./data/chinese_keyword_to_/chinese_keyword_en_prompt.txt"
# Text that marks the beginning of every record in the input file.
RECORD_DELIMITER = "\n\"请为“"
# Separator between the leading part of a record and its keyword string.
KEYWORD_DELIMITER = "**************"


def build_prompts(text, progress=None):
    """Build one translation prompt per well-formed record in *text*.

    *text* is split on ``RECORD_DELIMITER``; everything before the first
    delimiter is discarded. A record is well-formed when it contains
    ``KEYWORD_DELIMITER`` exactly once; the part after the separator, with
    leading and trailing newlines stripped, is inserted into ``prompt``.
    Records with zero or several separators are skipped.

    Args:
        text: Full content of the input file.
        progress: Optional callable that wraps the record list and returns
            an iterable over it (for example ``tqdm``).

    Returns:
        The list of formatted prompt strings, in input order.
    """
    records = text.split(RECORD_DELIMITER)[1:]
    if progress is not None:
        records = progress(records)

    prompts = []
    for record in records:
        parts = record.split(KEYWORD_DELIMITER)
        # An explicit length check replaces the former bare ``except:`` around
        # a two-way unpack, which also swallowed KeyboardInterrupt and any
        # unrelated error.
        if len(parts) != 2:
            continue
        prompts.append(prompt.format(parts[1].strip("\n")))
    return prompts


def main():
    """Read the input file, build the prompts, shuffle them and write them out.

    Each prompt is written to ``OUTPUT_PATH`` as a JSON string on its own
    line, with non-ASCII characters kept as-is.
    """
    with open(path, encoding="utf-8") as f:
        text = f.read()

    data_list = build_prompts(text, progress=tqdm)
    random.shuffle(data_list)

    with open(OUTPUT_PATH, mode="w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in data_list
        )


if __name__ == "__main__":
    main()