"""Convert the crawled journal-paper CSV into a JSON list of paper records.

The CSV is streamed in chunks, six columns are picked out of every row by
position, and the resulting list of dicts is written as UTF-8 JSON.
"""
import json
import math

import pandas as pd

filename = 'data/spider_latest_journal_paper_list.csv'
chunksize = 10000  # rows read per chunk; adjust as needed
output_filename = "data/data_0423_qikan.json"

# Output field -> positional column index in the CSV row.
# Per the original author's note the selected columns are: author, paper
# title, paper category, paper source, publication year, abstract.
# NOTE(review): the CSV header is not visible here - confirm the indices
# against the actual file if its layout changes.
FIELD_COLUMNS = {
    'author': 2,
    'title': 1,
    'special_topic': 7,
    'qikan_name': 3,
    'year': 4,
    'abstract': 10,
}


def _null_if_missing(value):
    """Return ``None`` for a float NaN (pandas' empty-cell marker), else *value*.

    ``json.dumps`` would otherwise emit the bare token ``NaN``, which is not
    valid JSON and is rejected by strict parsers.
    """
    if isinstance(value, float) and math.isnan(value):
        return None
    return value


def row_to_record(row):
    """Build one output record from a positional CSV row.

    Args:
        row: Sequence of cell values, indexable by the positions listed in
            ``FIELD_COLUMNS``.

    Returns:
        Dict with the keys of ``FIELD_COLUMNS`` (in that order); missing
        cells are represented as ``None``.

    Raises:
        IndexError: If *row* has fewer columns than ``FIELD_COLUMNS`` needs.
    """
    return {
        field: _null_if_missing(row[column])
        for field, column in FIELD_COLUMNS.items()
    }


def load_records(csv_path, rows_per_chunk=chunksize):
    """Read *csv_path* chunk by chunk and return the list of paper records.

    Args:
        csv_path: Path of the CSV file to read.
        rows_per_chunk: Number of rows pandas parses per chunk.

    Returns:
        List of dicts as produced by ``row_to_record``, in file order.
    """
    records = []
    # Iterating with ``chunksize`` keeps only one chunk's DataFrame alive
    # at a time instead of parsing the whole file into a single frame.
    for chunk in pd.read_csv(csv_path, chunksize=rows_per_chunk):
        records.extend(row_to_record(row) for row in chunk.values.tolist())
    return records


def dump_records(records, json_path):
    """Write *records* to *json_path* as indented, human-readable JSON.

    The file is opened explicitly as UTF-8 because ``ensure_ascii=False``
    writes non-ASCII text verbatim; relying on the platform default
    encoding can fail or produce a file that is not UTF-8.
    """
    with open(json_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(records, ensure_ascii=False, indent=2))


def main():
    """Convert the configured CSV file into the configured JSON file."""
    df_list = load_records(filename, chunksize)
    dump_records(df_list, output_filename)


if __name__ == "__main__":
    main()