You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
48 lines
1.4 KiB
48 lines
1.4 KiB
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
@Time : 2023/3/9 15:34
|
|
@Author :
|
|
@FileName:
|
|
@Software:
|
|
@Describe:
|
|
"""
|
|
import pdfplumber
|
|
import pandas as pd
|
|
|
|
# Location of the PDF this script processes, given relative to the current
# working directory; it is passed to the pdfplumber.open(path) calls below.
path = "./data/新建文件夹/13977991/全文对照.pdf"
|
|
# with pdfplumber.open(path) as pdf:
|
|
# first_page = pdf.pages[0]
|
|
# # 获取文本,直接得到字符串,包括了换行符【与PDF上的换行位置一致,而不是实际的“段落”】
|
|
# print(first_page.extract_text())
|
|
# # 获取本页全部表格,也可以使用extract_table()获得单个表格
|
|
# for table in first_page.extract_tables():
|
|
# #得到的table是嵌套list类型,转化成DataFrame更加方便查看和分析
|
|
# df = pd.DataFrame(table[1:], columns=table[0])
|
|
# print(df)
|
|
|
|
|
|
# Extract the text of every page of the PDF at `path`, strip the trailing
# page-number line from each page, and print the combined result.
with pdfplumber.open(path) as pdf:
    page_texts = []
    for page in pdf.pages:
        # extract_text() yields the page text with the PDF's own line breaks.
        # Some pdfplumber releases return None for a page without any text,
        # so fall back to an empty string instead of crashing on .split().
        lines = (page.extract_text() or '').split('\n')
        # The last line of each page is the page number at the bottom of the
        # page; drop it.
        page_texts.append('\n'.join(lines[:-1]))
    # Pages are concatenated without any separator between them, so the last
    # kept line of one page runs directly into the first line of the next.
    content = ''.join(page_texts)

print(content)
|
|
|
|
import pdfplumber
|
|
import pandas as pd
|
|
|
|
# Print every table found on one fixed page of the PDF at `path`.
with pdfplumber.open(path) as pdf:
    # Index 3 selects the fourth page of the document (indexing is 0-based).
    target_page = pdf.pages[3]
    for raw_table in target_page.extract_tables():
        # Wrap the extracted table in a DataFrame for display; no row is
        # promoted to a header here.
        frame = pd.DataFrame(raw_table)
        # To use the first row as the column header instead:
        # frame = pd.DataFrame(raw_table[1:], columns=raw_table[0])

        print(frame)
|