参考文献生成项目,使用faiss实现
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

66 lines
1.9 KiB

import json
# json.load()
# with open("t_xuewei_cnki_spider.csv", encoding="utf-8") as f:
# a = f.read()
# print(a)
import pandas as pd
# Input CSV to convert into a list of record dicts.
filename = 'data/spider_latest_journal_paper_list.csv'
chunksize = 10000  # number of rows read per chunk; adjust as needed
df_list = []
# Read the CSV in chunks of `chunksize` rows rather than all at once.
for chunk in pd.read_csv(filename, chunksize=chunksize):
    # Fields kept per row: author, paper title, paper category, paper source,
    # paper year, abstract. Values are picked by column POSITION, so this
    # depends on the CSV column order (inspect chunk.columns to check it).
    df_list.extend(
        {
            'author': row[2],
            'title': row[1],
            'special_topic': row[7],
            'qikan_name': row[3],
            'year': row[4],
            'abstract': row[10],
        }
        for row in chunk.values.tolist()
    )
# data = []
# json_list = [
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki.json",
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki2.json",
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki3.json",
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki6.json",
# "/home/majiahui/project/爬取目录筛选/t_xuewei_detail_cnki7.json",
# ]
#
#
# print("主库数据完成加载")
# for path in json_list:
# name, typr_file = path.split(".")
# name = name.split("/")[-1]
# a = json.load(open(path))
# for i in a:
# autoid = "_".join([name, str(i['autoid'])])
# if autoid in df_dict:
# data.append([i['f_title']] + df_dict[autoid])
# print("path完成筛选")
#
#
# Dump every collected record into one pretty-printed JSON file.
# The encoding is pinned to UTF-8 because ensure_ascii=False writes non-ASCII
# characters as-is; without an explicit encoding, open() falls back to the
# platform default, which can raise UnicodeEncodeError or produce a file that
# is not UTF-8.
with open("data/data_0416.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(df_list, ensure_ascii=False, indent=2))
#
# with open("data.json", encoding="utf-8") as f:
# for i in f.readlines():
# a = json.loads(i)
#
#
# print(a)