You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

48 lines
1.4 KiB

# -*- coding: utf-8 -*-
"""
@Time : 2023/3/9 15:34
@Author :
@FileName:
@Software:
@Describe:
"""
import pdfplumber
import pandas as pd
# Location of the PDF to process, relative to the working directory.
path = "./data/新建文件夹/13977991/全文对照.pdf"

# Reference snippet (disabled): read the text and the tables of the first page.
# with pdfplumber.open(path) as pdf:
#     first_page = pdf.pages[0]
#     # The text comes back as one string; its line breaks follow the PDF
#     # layout rather than the logical paragraphs.
#     print(first_page.extract_text())
#     # extract_tables() returns every table on the page; extract_table()
#     # returns a single one.
#     for table in first_page.extract_tables():
#         # Each table is a nested list; a DataFrame is easier to inspect.
#         df = pd.DataFrame(table[1:], columns=table[0])
#         print(df)

# Collect the text of every page, dropping each page's last line (the page
# number printed at the bottom of the page), then print the combined text.
with pdfplumber.open(path) as pdf:
    page_texts = []
    for page in pdf.pages:
        # extract_text() may give None for a page with no extractable text
        # (older pdfplumber releases); treat that as an empty page instead of
        # failing on None.split().
        lines = (page.extract_text() or "").split("\n")
        page_texts.append("\n".join(lines[:-1]))
    # Pages are joined with no separator, so the text of one page runs
    # straight into the text of the next.
    content = "".join(page_texts)
    print(content)
import pdfplumber
import pandas as pd
# Print every table found on the page at index 3 (the fourth page) of the PDF.
with pdfplumber.open(path) as document:
    target_page = document.pages[3]
    for rows in target_page.extract_tables():
        # Each table is a nested list; wrap it in a DataFrame (default integer
        # column labels) so it prints in tabular form.
        df = pd.DataFrame(rows)
        # Alternative: use the first row as the header instead.
        # df = pd.DataFrame(rows[1:], columns=rows[0])
        print(df)