Data-processing code for generating ChatGPT training data

import os
import json
import random
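
# Helper: True if the string contains at least one Chinese character
# ('\u4e00'..'\u9fa5' is the basic CJK Unified Ideographs range).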
def is_contains_chinese(strs):
    for _char in strs:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False
# pantten_second_biaoti = '[2二ⅡⅠ][、.]\s{0,}?[\u4e00-\u9fa5]+'
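
# Per-task config: "num_token" is an upper bound on the sample index taken
# from each file (-1 = take every sample), and "prompt" is the instruction
# string attached to every record from that file.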
lable_data_amount = {
    "title_beijing_prompt_data.txt": {"num_token": 5000, "prompt": "生成论文来源的背景#"},
    "title_jianjie_prompt_data.txt": {"num_token": 5000, "prompt": "生成研究内容#"},
    "title_mulu_prompt_data.txt": {"num_token": 5000, "prompt": "生成目录#"},
    "title_yanjiubeijingyiyi_prompt_data.txt": {"num_token": 5000, "prompt": "生成课题的研究背景和意义#"},
    "title_zongjie_prompt_data.txt": {"num_token": 5000, "prompt": "生成论文简短总结#"},
    "title_zongshu_prompt_data.txt": {"num_token": 5000, "prompt": "生成课题的国内外研究状况综述#"},
    "jianjie_task_book_prompt_data.txt": {"num_token": 5000, "prompt": "生成6点本篇论文应完成的主要内容#"},
    "title_mulu_references_prompt_data.txt": {"num_token": 1, "prompt": "生成参考文献#"},
    "title_mulu_small_title_prompt_shuffle_data.txt": {"num_token": -1, "prompt": "生成论文小标题内容#"},
    "title_mulu_zhaiyao_data.txt": {"num_token": 5000, "prompt": "生成论文摘要#"},
    "zhaiyao_chinese_keyword_prompt_data.txt": {"num_token": 5000, "prompt": "生成关键字#"},
    "zhaiyao_fanyi_prompt_data.txt": {"num_token": 5000, "prompt": "翻译摘要#"},
    "chinese_keyword_en_prompt_data.txt": {"num_token": 5000, "prompt": "翻译关键词#"},
    "title_hexin_beijing_prompt_data.txt": {"num_token": 4971, "prompt": "生成论文来源的背景#"},
    "title_hexin_jianjie_prompt_data.txt": {"num_token": 4903, "prompt": "生成研究内容#"},
    "title_hexin_mulu_prompt_data.txt": {"num_token": 4954, "prompt": "生成目录#"},
    "title_hexin_yanjiubeijingyiyi_prompt_data.txt": {"num_token": 4902, "prompt": "生成课题的研究背景和意义#"},
    "title_hexin_zongjie_prompt_data.txt": {"num_token": 4971, "prompt": "生成论文简短总结#"},
    "title_hexin_zongshu_prompt_data.txt": {"num_token": 4671, "prompt": "生成课题的国内外研究状况综述#"}
}
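
# Per-file separator used to split each raw dump into individual samples.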
re_file = {
    "title_beijing_prompt_data.txt": "\n以“",
    "title_jianjie_prompt_data.txt": "\n请帮我生成《",
    "title_mulu_prompt_data.txt": "\n为论文题目“",
    "title_yanjiubeijingyiyi_prompt_data.txt": "\n请分别写出以《",
    "title_zongjie_prompt_data.txt": "\n以“",
    "title_zongshu_prompt_data.txt": "\n请写出以《",
    "jianjie_task_book_prompt_data.txt": "\n\"请根据题目为《",
    "title_mulu_references_prompt_data.txt": "\n\"论文题目是“",
    "zhaiyao_chinese_keyword_prompt_data.txt": "\n\"请为“",
    "zhaiyao_fanyi_prompt_data.txt": "\n\"请把“",
    "chinese_keyword_en_prompt_data.txt": "\n\"请把“",
    "title_mulu_zhaiyao_data.txt": "@@@@@@@@@@@@@@@@@@",
    "title_mulu_small_title_prompt_shuffle_data.txt": "@@@@@@@@@@@@@@@@@@",
    "title_hexin_beijing_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_jianjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_mulu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_yanjiubeijingyiyi_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_zongjie_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@",
    "title_hexin_zongshu_prompt_data.txt": "@@@@@@@@@@@@@@@@@@@@@@@"
}
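
# Files whose separator is a pure "@" delimiter (teshu = "special"):
# for these, the separator text is NOT re-prepended to each sample below.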
split_teshu = [
    "title_mulu_zhaiyao_data.txt",
    "title_mulu_small_title_prompt_shuffle_data.txt",
    "title_hexin_beijing_prompt_data.txt",
    "title_hexin_jianjie_prompt_data.txt",
    "title_hexin_mulu_prompt_data.txt",
    "title_hexin_yanjiubeijingyiyi_prompt_data.txt",
    "title_hexin_zongjie_prompt_data.txt",
    "title_hexin_zongshu_prompt_data.txt"
]
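
# Gather every file under the four raw prompt-data directories.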
path_list = []
for data_dir in [
    "./data/paper_prompt_title_3",
    "./data/paper_prompt_title_3_1",
    "./data/paper_prompt_title_3_1_1",
    "./data/paper_prompt_title_hexin_3",
]:
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            path_list.append(os.path.join(root, file))
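
# Split each raw file into samples, keep those containing the
# "**************" content/summary marker, and tally per-task counts
# (tongji = "statistics").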
text_list_new = []
tongji = {}
for path in path_list:
    # os.path.basename is portable; the original path.split("\\")[-1] only works on Windows.
    task_name = os.path.basename(path)
    if task_name in re_file:
        spilt_dan = re_file[task_name]
    else:
        continue
    train_data_amount_dict = lable_data_amount[task_name]
    train_data_amount = train_data_amount_dict["num_token"]
    prompt = train_data_amount_dict["prompt"]
    with open(path, encoding="utf-8") as f:
        text = f.read()
    text_list = text.split(spilt_dan)
    index = 1
    if train_data_amount == -1:
        train_data_amount = len(text_list) - 1
    while True:
        # Stop at the sample cap; also guard against files with fewer samples than the cap.
        if index >= train_data_amount or index >= len(text_list):
            break
        data_dan = text_list[index]
        if "**************" in data_dan:
            # if task_name == "title_jianjie_prompt_data.txt":
            #     content, summary = data_dan.split("**************")
            #     bool_ = is_contains_chinese(summary)
            #     if bool_ == False:
            #         index += 1
            #         continue
            if task_name not in split_teshu:
                # str.split consumed the separator, so restore it (minus its leading newline).
                data_dan = spilt_dan[1:] + data_dan
            text_list_new.append((data_dan, prompt))
            index += 1
            tongji[task_name] = tongji.get(task_name, 0) + 1
        else:
            # Malformed sample: log it and skip ahead.
            index += 4
            print(data_dan)
# train_list.append({"content": str(title_p), "summary": str(b)})
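# Convert each (sample, prompt) pair into a {"query", "response", "prompt"} record.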
train_list = []
for text, prompt in text_list_new:
    # Split on the first marker only, in case the marker also appears inside the summary.
    content, summary = text.split("**************", 1)
    train_list.append(
        {"query": str(content).strip("\"").strip("\n").strip("\""), "response": str(summary), "prompt": prompt}
    )
random.shuffle(train_list)
for i in tongji:
    print(i, tongji[i])
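
# Write the shuffled records as JSON Lines (one UTF-8 JSON object per line).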
with open("./data/chatglm_paper_data_2_prompt.txt", mode="w", encoding="utf-8") as f:
for i in train_list:
f.write(json.dumps(i, ensure_ascii=False))
f.write("\n")