数据处理代码,用于生成 ChatGPT 数据
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

158 lines
5.2 KiB

import time
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis
import re
def is_contains_chinese(strs):
    """Return True if *strs* contains at least one CJK unified ideograph.

    Args:
        strs: Text to scan; iterated one character at a time.

    Returns:
        True when some character lies in U+4E00..U+9FFF, otherwise False.
        An empty input yields False.
    """
    # U+9FFF is the last code point of the CJK Unified Ideographs block;
    # stopping at U+9FA5 would miss the ideographs assigned after it.
    return any('\u4e00' <= _char <= '\u9fff' for _char in strs)
# Serialises the output-file writes and the Redis hand-off between threads.
# It is re-entrant, so a thread that already holds it may acquire it again.
lock = threading.RLock()

# NOTE(review): the Redis host and password are hard-coded in source; they
# should be supplied through configuration or the environment instead.
pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=11, password='Zhicheng123*')
# NOTE(review): when an explicit connection_pool is given, redis-py appears to
# take connection options from the pool, so decode_responses here presumably
# has no effect -- the dispatcher decodes the popped bytes by hand. Confirm
# before relying on str replies.
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

with open("api_key.txt", "r",) as f:
    a = f.read()
a = a.split("\n")

# Pattern capturing everything between the first 《 and the last 》 of a line.
pantten_title = "《(.*)》"
# Redis list holding the API keys that are currently free to use.
redis_key_name_openaikey_list = "openaikey_list"
# Redis list holding the pending sub-tasks.
redis_zirenwu = "redis_zirenwu"

# Each line is "<fields>----<key>"; only the last field is the key. Lines that
# yield no key (e.g. the empty string after a trailing newline) are skipped so
# that no empty API key ends up in the queue.
api_key_list = []
for line in a:
    key = str(line.split("----")[-1]).strip()
    if key:
        api_key_list.append(key)

for i in api_key_list:
    redis_.rpush(redis_key_name_openaikey_list, i)
# Prompt templates keyed by task type. Every template is formatted with
# ``.format(title, core_content)``: argument 0 is the paper title and
# argument 1 is the generation direction. Templates with a single slot use
# only the title; str.format ignores the unused extra argument.
prompt_dict = {
    # Table of contents. The sentence names the direction before the title,
    # so explicit indices are used to keep each value in its intended slot.
    "mulu_prompt": "以“{1}”为论文的生成方向,为论文题目为“{0}”生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题",
    # Background of the topic (at most 200 characters requested).
    "beijing_prompt": "以“{}”为论文题目,以“{}”为论文的生成方向,写一段题目来源的背景,要求字数在200字以内",
    # Short summary of the paper (at most 300 characters requested).
    "zongjie_prompt": "以“{}”为论文题目,以“{}”为论文的生成方向,写一个论文简短总结,要求在300字以内",
    # Review of domestic and international research (about 800 characters).
    "zongshu_prompt": "以《{}》为课题,以“{}”为论文的生成方向,请写出这篇论文的国内外研究状况综述,字数在800字左右",
    # Research background and significance (title only).
    "yanjiubeijingyiyi_prompt": "请分别写出以《{}》为课题的研究背景和意义,字数不少于1000字",
    # Research content overview (title only).
    "jianjie_prompt": "请帮我生成《{}》为题目的研究内容,包括整体简介和分最少三个方面总结",
}
# Load the source lines. A useful line has the form
# "<prefix>**************<payload>"; only the payload is kept.
with open("./data/paper_prompt_title_3/title_zhuyaoneirong_prompt_data.txt", encoding="utf-8") as f:
    text = f.read()
text_list = text.split("\n")
title_list = [line.split("**************")[1] for line in text_list if "**************" in line]
random.shuffle(title_list)
print(len(title_list))

# Expand every usable payload into one sub-task per prompt template.
zirenwu_list = []
for text in title_list:
    # A usable payload contains Chinese text and the "》:" separator that
    # divides the title from the core content.
    if not is_contains_chinese(text) or "》:" not in text:
        continue
    text = text.strip("\"")
    result_biaoti_list = re.findall(pantten_title, text)
    if not result_biaoti_list:
        # No 《...》 pair was found, so there is no title to work with.
        continue
    title = result_biaoti_list[0]
    hexinnrirong = text.split("》:")[1]
    for task_type, template in prompt_dict.items():
        zirenwu_list.append((task_type, str(template).format(title, hexinnrirong)))

# Each sub-task is queued as the repr of a (task_type, prompt) tuple.
for i in zirenwu_list:
    redis_.rpush(redis_zirenwu, str(i))
def request_api_chatgpt(api_key, task_type, prompt):
    """Send one prompt to the chat-completions API and record the answer.

    On success the prompt/answer pair is appended to the output file of the
    given task type. On any failure the error is printed, the thread pauses
    for five seconds, and the sub-task is pushed back onto the Redis task
    list. In both cases the API key is returned to the Redis key list exactly
    once.

    Args:
        api_key: API key used for this single request.
        task_type: Task-type name; it selects the output file.
        prompt: Fully formatted prompt sent as the user message.

    Returns:
        None. Results are reported through the output file and Redis.
    """
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.5
    }

    succeeded = False
    try:
        response = requests.post(url,
                                 headers=headers,
                                 data=json.dumps(data),
                                 timeout=240)
        res = response.json()
        # A reply without "choices" (presumably an API error payload) raises
        # here and is handled like any other failure: the task is retried.
        text = res["choices"][0]["message"]["content"]
        record = "@@@@@@@@@@@@@@@@@@@@@@@" + prompt + "**************" + text + "\n"
        # The context manager releases the lock even if the write fails, so a
        # failing thread can never leave it held.
        with lock:
            with open("/home/majiahui/mulu_ner/data/paper_prompt_title_hexin_3/title_{}_data.txt".format(task_type),
                      mode="a", encoding="utf-8") as f:
                f.write(record)
        succeeded = True
    except Exception as exc:
        print("task_type_bad", task_type)
        print("api_key_bad", api_key)
        print("error", repr(exc))
        time.sleep(5)

    # Return the key once, whatever happened; re-queue the task on failure.
    with lock:
        redis_.rpush(redis_key_name_openaikey_list, api_key)
        if not succeeded:
            redis_.rpush(redis_zirenwu, str((task_type, prompt)))
if __name__ == '__main__':
    # Imported here because only the dispatcher loop needs it.
    import ast

    # Dispatcher: pair one free API key with one pending sub-task and hand
    # both to a worker thread. Polls once per second while either Redis list
    # is empty.
    while True:
        if redis_.llen(redis_zirenwu) == 0 or redis_.llen(redis_key_name_openaikey_list) == 0:
            time.sleep(1)
            continue

        with lock:
            api_key = redis_.lpop(redis_key_name_openaikey_list)
            dan_zirenwu = redis_.lpop(redis_zirenwu) if api_key is not None else None
            if api_key is not None and dan_zirenwu is None:
                # The task list was drained between the length check and the
                # pop: give the key back instead of losing it.
                redis_.rpush(redis_key_name_openaikey_list, api_key)
        if api_key is None or dan_zirenwu is None:
            time.sleep(1)
            continue

        # Replies are bytes unless the client decodes them; accept both.
        if isinstance(api_key, bytes):
            api_key = api_key.decode('UTF-8')
        if isinstance(dan_zirenwu, bytes):
            dan_zirenwu = dan_zirenwu.decode('UTF-8')

        # SECURITY: queued items are the repr of a (task_type, prompt) tuple.
        # They come from a network-reachable Redis, so they are parsed with
        # ast.literal_eval, which only accepts literals, instead of eval().
        try:
            parsed = ast.literal_eval(dan_zirenwu)
            task_type, prompt = parsed[0], parsed[1]
        except (ValueError, SyntaxError, TypeError, LookupError) as exc:
            # A malformed item is dropped; the key goes back to the pool.
            print("zirenwu_bad", dan_zirenwu, repr(exc))
            with lock:
                redis_.rpush(redis_key_name_openaikey_list, api_key)
            continue

        t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt))
        t.start()