数据处理代码,为了生成chatgpt数据
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

155 lines
4.8 KiB

import time
from tqdm import tqdm
import random
import requests
import json
import threading
from threading import Thread
import redis
import re
def is_contains_chinese(strs):
    """Return True if any character of *strs* lies in U+4E00..U+9FA5.

    The check is a plain code-point range test per character; an empty
    input yields False.
    """
    return any('\u4e00' <= ch <= '\u9fa5' for ch in strs)
# Single re-entrant lock shared by the dispatcher and the worker threads.
lock = threading.RLock()

# NOTE(review): the Redis host and password are hard-coded here; they should
# be moved to configuration / environment variables and the password rotated.
pool = redis.ConnectionPool(host='104.244.90.248', port=63179, max_connections=50, db=11, password='Zhicheng123*')
# NOTE(review): decode_responses is passed next to an explicit connection_pool;
# it appears not to take effect in that case (the dispatcher loop still
# receives bytes and decodes them itself) -- confirm against redis-py docs
# before changing, because the consumer code relies on bytes replies.
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

# api_key.txt holds one record per line; the key is the last "----" field.
with open("api_key.txt", "r",) as f:
    a = f.read()
a = a.split("\n")

pantten_title = "《(.*)》"
redis_key_name_openaikey_list = "openaikey_list"
redis_zirenwu = "redis_zirenwu"

api_key_list = []
for i in a:
    key = str(i.split("----")[-1]).strip()
    # Skip blank lines (e.g. the trailing newline of the file); otherwise an
    # empty string would be queued as an API key and handed to a worker.
    if key:
        api_key_list.append(key)

# Publish every key to the shared Redis list that workers pop keys from.
for i in api_key_list:
    redis_.rpush(redis_key_name_openaikey_list, i)
# Prompt templates keyed by task type. Each template has two "{}" slots that
# are filled positionally with (title, generation direction) via str.format.
prompt_dict = dict(
    # Table-of-contents prompt.
    mulu_prompt="论文题目为“{}”,以“{}”为论文的生成方向,为论文生成目录,要求只有一级标题和二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题不少于7个;每个一级标题至少包含3个二级标题",
    # Research background and significance prompt.
    yanjiubeijingyiyi_prompt="请分别写出以《{}》为课题,以“{}”为论文的生成方向,生成论文的研究背景和意义,字数不少于1000字",
    # Research-content summary prompt.
    jianjie_prompt="请帮我生成《{}》为题目,以“{}”为论文的生成方向,生成论文研究内容,包括整体简介和分最少三个方面总结",
)
# Read the previously generated records. Only lines containing the
# "**************" separator are used, and only the segment right after the
# first separator is kept.
with open("./data/paper_prompt_title_3/title_zhuyaoneirong_prompt_data.txt", encoding="utf-8") as f:
    text = f.read()
text_list = text.split("\n")

title_list = []
for i in text_list:
    if "**************" in i:
        title_list.append(i.split("**************")[1])
random.shuffle(title_list)
print(len(title_list))

# Build one (task_type, filled_prompt) pair per prompt template for every
# usable record.
zirenwu_list = []
for text in title_list:
    # Records without any Chinese character are skipped.
    if not is_contains_chinese(text):
        continue
    # The record must look like "《title》:direction".
    if "》:" not in text:
        continue
    text = text.strip("\"")
    result_biaoti_list = re.findall(pantten_title, text)
    # No "《...》" match means there is no title to work with.
    if not result_biaoti_list:
        continue
    title = result_biaoti_list[0]
    # Everything after the first "》:" (up to the next one, if any) is the
    # generation direction passed as the second template argument.
    hexinnrirong = text.split("》:")[1]
    for prompt in prompt_dict:
        zirenwu_list.append((prompt, str(prompt_dict[prompt]).format(title, hexinnrirong)))

# Queue every task in Redis as the repr of its (task_type, prompt) tuple;
# the dispatcher parses that repr back into a tuple.
for i in zirenwu_list:
    redis_.rpush(redis_zirenwu, str(i))
def request_api_chatgpt(api_key, task_type, prompt):
    """Run one chat-completion request and persist the answer.

    On success the API key is pushed back to the Redis key list and one
    record ("@@@..." + prompt + "**************" + answer + newline) is
    appended to the output file for *task_type*.

    On any failure (network error, unexpected response shape, write error)
    the failure is printed, the thread sleeps 5 seconds, and both the API key
    and the task are pushed back to their Redis lists so they can be retried.

    Args:
        api_key: OpenAI API key used for the Authorization header.
        task_type: Prompt-template name; used in the output file name.
        prompt: The fully formatted user prompt to send.
    """
    # Tracks whether the key has already been handed back, so a failure after
    # that point does not put the same key into the pool twice.
    key_returned = False
    try:
        url = "https://api.openai.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }
        data = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.5
        }
        response = requests.post(url,
                                 headers=headers,
                                 data=json.dumps(data),
                                 timeout=240)
        res = response.json()
        # Raises KeyError/IndexError when the API answered with an error
        # payload; that is handled as a failed task below.
        text = res["choices"][0]["message"]["content"]

        # "with" guarantees the lock is released even if the write fails;
        # a leaked lock would block every other worker thread forever.
        with lock:
            redis_.rpush(redis_key_name_openaikey_list, api_key)
            key_returned = True
            # Explicit UTF-8 so the Chinese text does not depend on the locale.
            with open("/home/majiahui/mulu_ner/data/paper_prompt_title_hexin_3/title_{}_data.txt".format(task_type), mode="a", encoding="utf-8") as f:
                f.write("@@@@@@@@@@@@@@@@@@@@@@@")
                f.write(prompt)
                f.write("**************")
                f.write(text)
                f.write("\n")
    except Exception as exc:
        print("task_type_bad", task_type)
        print("api_key_bad", api_key)
        print("error_bad", repr(exc))
        # Back off briefly before making the key and the task available again.
        time.sleep(5)
        with lock:
            if not key_returned:
                redis_.rpush(redis_key_name_openaikey_list, api_key)
            redis_.rpush(redis_zirenwu, str((task_type, prompt)))
if __name__ == '__main__':
    # Local import: only the dispatcher needs it.
    import ast

    # Dispatcher loop: whenever both a task and an API key are available,
    # pop one of each and run the request in its own thread. The number of
    # in-flight threads is bounded by the number of keys, because a key is
    # only returned to the list when its worker finishes.
    while True:
        if redis_.llen(redis_zirenwu) == 0 or redis_.llen(redis_key_name_openaikey_list) == 0:
            time.sleep(1)
            continue

        with lock:
            api_key = redis_.lpop(redis_key_name_openaikey_list)
            dan_zirenwu = redis_.lpop(redis_zirenwu)

        # Another consumer may have emptied a list between the length check
        # and the pop; give back whatever was taken and try again later.
        if api_key is None or dan_zirenwu is None:
            if api_key is not None:
                redis_.rpush(redis_key_name_openaikey_list, api_key)
            if dan_zirenwu is not None:
                redis_.rpush(redis_zirenwu, dan_zirenwu)
            time.sleep(1)
            continue

        api_key = api_key.decode('UTF-8')
        dan_zirenwu = dan_zirenwu.decode('UTF-8')
        try:
            # Tasks are stored as the repr of a (task_type, prompt) tuple.
            # literal_eval parses that without executing arbitrary code that
            # might have been written to the Redis list.
            task_type, prompt = ast.literal_eval(dan_zirenwu)
        except (ValueError, SyntaxError):
            # Malformed entry: drop it, but do not lose the API key.
            print("zirenwu_bad", dan_zirenwu)
            redis_.rpush(redis_key_name_openaikey_list, api_key)
            continue

        t = Thread(target=request_api_chatgpt, args=(api_key, task_type, prompt))
        t.start()