"""Build per-discipline embedding matrices and metadata files for journal papers.

The script reads paper records from ``data/data_0416.json``, maps each
``special_topic`` entry to a discipline via ``data/discipline_types.json``,
encodes a "title + abstract" prompt for every paper, and writes, per
discipline, one ``.npy`` embedding matrix and one ``.json`` metadata file.
"""
import os

# Set before the ML imports below so CUDA initialisation only sees device 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import json

import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Width of the accumulator that encoded batches are appended to. It has to
# match the model's output width, otherwise np.concatenate raises.
EMBEDDING_DIM = 768
# Number of prompts handed to the model per encode() call.
BATCH_SIZE = 10000
# Text template that is embedded for every paper (title, abstract).
PROMPT_TEMPLATE = "标题:“{}”,摘要:“{}”"
# Constant tag stored as the last field of every metadata row.
SOURCE_TYPE = "期刊"
PROMPT_DIR = "data/prompt_qikan"
INFO_DIR = "data/data_info_qikan"

model = SentenceTransformer('Dmeta-embedding-zh')
print(1)

# Maps a raw ``special_topic`` label to the discipline it is grouped under.
with open("data/discipline_types.json", encoding="utf-8") as f:
    lable_discipline_types = json.load(f)


def erjimul_ulit():
    """Placeholder with no behaviour; kept so existing imports keep working."""


def shengcehng_array(data):
    """Encode ``data`` with the module-level model, normalising the embeddings.

    Args:
        data: The sentences to encode (passed straight to ``model.encode``).

    Returns:
        Whatever ``model.encode(data, normalize_embeddings=True)`` returns;
        the callers in this file treat it as a 2-D array with
        ``EMBEDDING_DIM`` columns.
    """
    return model.encode(data, normalize_embeddings=True)


def _group_by_discipline(records, label_to_discipline):
    """Group prompts and metadata rows of ``records`` by discipline.

    Args:
        records: Iterable of dicts with the keys ``special_topic``, ``title``,
            ``abstract``, ``author``, ``qikan_name`` and ``year``.
        label_to_discipline: Mapping from a ``special_topic`` label to the
            discipline name used as the grouping key.

    Returns:
        A ``(prompts, infos)`` pair of dicts keyed by discipline. ``prompts``
        holds the formatted prompt strings; ``infos`` holds rows of
        ``[author, title, special_topic, journal, year, abstract, source]``.
    """
    prompts = {}
    infos = {}
    for record in records:
        topics = record["special_topic"]
        # A missing topic stringifies to "nan" (e.g. a JSON NaN parsed as a
        # float); such records carry no category and are skipped.
        if str(topics) == "nan":
            continue

        for topic in topics.split(";"):
            if topic not in label_to_discipline:
                continue
            discipline = label_to_discipline[topic]
            # NOTE(review): if two topics of one record map to the same
            # discipline the record is stored twice; the original script did
            # the same -- confirm whether that is intended.
            prompts.setdefault(discipline, []).append(
                PROMPT_TEMPLATE.format(record["title"], record["abstract"])
            )
            infos.setdefault(discipline, []).append([
                record["author"],
                record["title"],
                record["special_topic"],
                record["qikan_name"],
                record["year"],
                record["abstract"],
                SOURCE_TYPE,
            ])
    return prompts, infos


def _encode_in_batches(prompts, batch_size=BATCH_SIZE):
    """Encode ``prompts`` in slices of ``batch_size`` and stack the results.

    The batch offsets are derived from ``range`` for every call, so no cursor
    state can leak from one discipline into the next.

    Args:
        prompts: List of prompt strings belonging to one discipline.
        batch_size: Maximum number of prompts per ``shengcehng_array`` call.

    Returns:
        A float64 array of shape ``(len(prompts), EMBEDDING_DIM)``. The
        float64 dtype comes from the ``np.empty`` seed and is kept so that the
        saved files stay identical to what the script produced before.
    """
    embeddings = np.empty((0, EMBEDDING_DIM))
    total = len(prompts)
    for start in range(0, total, batch_size):
        batch = prompts[start:start + batch_size]
        if start + batch_size >= total:
            # Final (possibly short) batch of this discipline.
            print("len(linshi_data)", len(batch))
        embeddings = np.concatenate((embeddings, shengcehng_array(batch)))
        print("data_ndarray.shape", embeddings.shape)
    return embeddings


def main():
    """Run the pipeline: load records, group, embed and write the outputs."""
    with open("data/data_0416.json", encoding="utf-8") as f:
        data = json.load(f)
    print(len(data))

    data_prompt, data_info = _group_by_discipline(data, lable_discipline_types)

    print(2)
    print(len(data_prompt))

    # The output folders are not guaranteed to exist on a fresh checkout.
    os.makedirs(PROMPT_DIR, exist_ok=True)
    for leibie in tqdm(data_prompt):
        prompts = data_prompt[leibie]
        print("len(data_prompt[leibie])", len(prompts))
        np.save(f"{PROMPT_DIR}/{leibie}.npy", _encode_in_batches(prompts))

    os.makedirs(INFO_DIR, exist_ok=True)
    for leibie, rows in data_info.items():
        print(len(rows))
        with open(f"{INFO_DIR}/{leibie}.json", "w", encoding="utf-8") as f:
            json.dump(rows, f, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()