"""Build per-discipline embedding arrays for doctoral-thesis records.

Reads data/data_0423_doctor.json, drops records with any missing ("nan")
required field, groups "title + abstract" prompts per discipline label
(split into Chinese-title vs non-Chinese-title pools), encodes the
Chinese-title prompts with a SentenceTransformer in batches of 10000,
and writes one .npy embedding matrix and one metadata JSON per discipline.
"""
import os

# Must be set before any CUDA-aware library (torch via sentence_transformers)
# is imported, otherwise the device restriction is ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import json
import re

import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('Dmeta-embedding-zh')
print(1)

# Maps raw discipline strings from the data to canonical category names.
with open("data/discipline_types.json", encoding="utf-8") as f:
    lable_discipline_types = json.loads(f.read())


def erjimul_ulit():
    # Placeholder kept for compatibility; intentionally does nothing.
    pass


def shengcehng_array(data):
    """Encode a list of texts into L2-normalized embedding vectors.

    :param data: list of prompt strings
    :return: np.ndarray of shape (len(data), 768) for this model
    """
    embs = model.encode(data, normalize_embeddings=True)
    return embs


def is_contain_chinese(word):
    """Return True if *word* contains at least one CJK character (U+4E00..U+9FA5)."""
    return re.search(r'[\u4e00-\u9fa5]', word) is not None


if __name__ == '__main__':
    with open("data/data_0423_doctor.json", encoding="utf-8") as f:
        data = json.loads(f.read())
    print(len(data))

    a = 0    # records accepted (counted once per matched discipline label)
    a_ = 0   # records skipped because a required field is missing ("nan")

    # Chinese-title pools: prompt strings and
    # [author, title, topics, journal, year, abstract, degree] metadata rows.
    data_info = {}
    data_prompt = {}
    # Same structure for records whose title contains no Chinese characters.
    data_info_en = {}
    data_prompt_en = {}

    required_fields = ("special_topic", "author", "title",
                       "qikan_name", "year", "abstract")
    for data_dan in data:
        # Pandas-style missing values arrive as the string/float "nan".
        if any(str(data_dan[field]) == "nan" for field in required_fields):
            a_ += 1
            continue
        for leibie in data_dan["special_topic"].split(";"):
            if leibie not in lable_discipline_types:
                continue
            key = lable_discipline_types[leibie]
            # Route the record into the Chinese or non-Chinese pool; the
            # stored content is identical either way.
            if is_contain_chinese(data_dan["title"]):
                prompts, infos = data_prompt, data_info
            else:
                prompts, infos = data_prompt_en, data_info_en
            dan_data_prompt = "标题:“{}”,摘要:“{}”".format(
                data_dan["title"], data_dan["abstract"])
            prompts.setdefault(key, []).append(dan_data_prompt)
            infos.setdefault(key, []).append(
                [data_dan["author"], data_dan["title"],
                 data_dan["special_topic"], data_dan["qikan_name"],
                 data_dan["year"], data_dan["abstract"], "博士"])
            a += 1
    print(2)

    print(len(data_prompt))
    batch_size = 10000
    for leibie in tqdm(data_prompt):
        category_prompts = data_prompt[leibie]
        print("len(data_prompt[leibie])", len(category_prompts))
        data_ndarray = np.empty((0, 768))
        # Encode in fixed-size batches. NOTE(review): the original kept
        # cursor variables (strat/end) that — as laid out — were not reset
        # per category, so every category after the first would be sliced
        # from a stale offset; a per-category range() batcher fixes that
        # and is correct regardless of the intended layout.
        for start in range(0, len(category_prompts), batch_size):
            linshi_data = category_prompts[start:start + batch_size]
            print("len(linshi_data)", len(linshi_data))
            data_ndarray = np.concatenate(
                (data_ndarray, shengcehng_array(linshi_data)))
            print("data_ndarray.shape", data_ndarray.shape)
        np.save(f'data/prompt_doctor/{leibie}.npy', data_ndarray)

    for leibie in data_info:
        print(len(data_info[leibie]))
        with open(f"data/data_info_doctor/{leibie}.json", "w",
                  encoding="utf-8") as f:
            f.write(json.dumps(data_info[leibie], ensure_ascii=False, indent=2))

    # The non-Chinese pool is only reported, never embedded or saved.
    for i in data_prompt_en:
        print(i)
        print(len(data_prompt_en[i]))
    print(len(data))
    print(a_)