from pypdf import PdfReader

# Read the document information and the page count of a small sample PDF,
# then print a short summary of both.
with open('/data/demo/minimal.pdf', 'rb') as f:
    pdf = PdfReader(f)
    # pypdf documents `metadata` as optional: it is None when the file has
    # no document-information dictionary.
    information = pdf.metadata
    number_of_pages = len(pdf.pages)

    # getattr() with a default turns a missing metadata object into None
    # fields instead of an AttributeError. The lookup is done while the file
    # is still open in case pypdf needs to read from the stream.
    meta = {
        name: getattr(information, name, None)
        for name in ('author', 'creator', 'producer', 'subject', 'title')
    }

txt = f"""Author: {meta['author']}
Creator: {meta['creator']}
Producer: {meta['producer']}
Subject: {meta['subject']}
Title: {meta['title']}
Number of pages: {number_of_pages}"""
print(txt)

Author: None
Creator: cairo 1.11.2 (http://cairographics.org)
Producer: cairo 1.11.2 (http://cairographics.org)
Subject: None
Title: None
Number of pages: 1

# Open the book PDF and report how many pages it has.
# The handle is deliberately left open: later cells keep reading pages
# through `pdfreader`.
pdffile = open('/data/demo/postgis-essential-0423.pdf', mode='rb')
pdfreader = PdfReader(pdffile)
total_pages = len(pdfreader.pages)
print(total_pages)

54

# Extract the text of the first page and print it without leading or
# trailing whitespace.
page = pdfreader.pages[0]
first_page_text = page.extract_text()
print(first_page_text.strip())

第1 章 地理空间数据库的发展、技术与标准
回答地理空间问题有很多工具，或桌面应用系统。这种方法虽然功能完备，但不能同时回
答许多问题。此外，这种方法通常无法在一个数据集中有效地管理和操作大量的空间数据集，
也无法使任务自动化。
一旦需要可伸缩性、对大型数据集的支持以及直接输入机制，大多数用户就会使用空间数
据库进行探索。有几个可用的空间数据库软件，一些是专有的，另一些是开源的。PostGIS 是
一个开源的空间数据库软件，可能是所有空间数据库软件中最容易访问的。
PostGIS 作为扩展运行，为 PostgreSQL 数据库提供空间功能。在这种能力下，PostGIS
允许将空间数据与常规关系型数据一起包含进来。通过构建 PostGIS 提供的核心功能和
PostgreSQL 固有的可扩展性，可以实现新的或增强的功能。
在数据库存储方面，数据库是高级形式，而 PostGIS 赋予其更多的功能。
1.1 平面文件、空间数据引擎到空间数据库
在传统的第一代 地理信息系统（GIS）实现中，所有的空间数据都存储在 平面文件（flat
files ）中，需要专门的 GIS 软件来解释和操作这些数据。这些第一代管理系统旨在满足用
户的需求，其中所有所需的数据都在用户的组织领域中。它们是专为处理 空间数据而构建的专
有的、独立的系统，应用程序和平面文件之间的耦合性非常高，平面文件里的空间数据没有数
据独立性。
为了提高数据库管理系统（DBMS）对空间数据的管理能力, 国内外较为流行的主要集中在
“关系型数据库 + 空间数据引擎” 、 “扩展对象关系型数据库”两方面。
“关系型数据库＋空间数据引擎”技术方案访问迅速，与 GIS 联系紧密，在应用中占有一
定优势。问题是引擎与数据库内核独立，难以利用数据库系统中已有的成熟的管理、访问技术，
在进一步发展上有致命弱点。
“扩展对象空间数据库系统” 技术方案从理论上来看，是最适用于空间数据的表达和管理
的。
3

# Collect the 0-based page indices at which chapters start.
# idx_arr always begins with 0 and ends with the total page count, so each
# pair of consecutive entries delimits one section of the document.
idx_arr = [0]
for idx, page in enumerate(pdfreader.pages):
    cnts = page.extract_text().strip().splitlines()
    if not cnts:
        # A page without extractable text has no first line to inspect;
        # indexing cnts[0] here would raise IndexError.
        continue
    first_line = cnts[0]
    # A page opens a chapter when its first line contains both '第' and '章'.
    if '第' in first_line and '章' in first_line:
        print(idx)
        print(first_line)
        # Index 0 is already in the list as the initial boundary.
        if idx != 0:
            idx_arr.append(idx)
idx_arr.append(len(pdfreader.pages))

print(idx_arr)

0
第1 章 地理空间数据库的发展、技术与标准
20
第16 章 管理栅格数据
32
第17 章 高级话题
42
第18 章 运维
[0, 20, 32, 42, 54]

from pypdf import PdfWriter

pdf_idx = 1  # 1-based sequence number used in the output file names

from pathlib import Path
outws = Path('xx_post')
# Create the output directory if needed; an existing directory is reused.
outws.mkdir(exist_ok=True)

# Each pair of consecutive boundaries (qq, hh) in idx_arr becomes one output
# PDF holding the pages from qq up to, but not including, hh.
for qq, hh in zip(idx_arr[:-1], idx_arr[1:]):
    outfile = outws / f'xx_{pdf_idx:02}.pdf'
    print(outfile)

    merger = PdfWriter()
    try:
        # Copy the selected page range of the source document.
        merger.append(fileobj=pdfreader, pages=(qq, hh))

        # The context manager closes the output file even if writing fails.
        with open(outfile, "wb") as output:
            merger.write(output)
    finally:
        # Always release the writer, whether or not the write succeeded.
        merger.close()
    pdf_idx = pdf_idx + 1

xx_post/xx_01.pdf
xx_post/xx_02.pdf
xx_post/xx_03.pdf
xx_post/xx_04.pdf

# Load servers.pdf and check whether it is encrypted.
# A reader is built for the newly opened file; without it, `pdfreader` would
# still refer to the previously loaded document and report on that instead.
# NOTE(review): the handle previously bound to `pdffile` is not closed here,
# so this cell can still be run on its own; close it first if that matters.
pdffile = open('/data/demo/servers.pdf', 'rb')
pdfreader = PdfReader(pdffile)
pdfreader.is_encrypted

False

# Take the first page of the current reader and display the keys of its
# page dictionary (shown as the cell's result in a notebook).
pg = pdfreader.pages[0]
pg.keys()

dict_keys(['/Resources', '/Type', '/Parent', '/Contents', '/MediaBox'])

# Display the values stored in the page dictionary held in `pg`.
pg.values()

dict_values([IndirectObject(322, 0, 139795583271520), '/Page', IndirectObject(1646, 0, 139795583271520), [IndirectObject(321, 0, 139795583271520)], [0, 0, 595.28, 841.89]])

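# Disabled example: save the data of one image XObject from the page
# resources to a file, then preview it with PIL and matplotlib.
# im8 = pg['/Resources']['/XObject']['/FXX1'].get_data()
# with open('im.png', 'wb') as f:
#     f.write(im8)
# from matplotlib import pyplot as plt
# from PIL import Image
# img=Image.open('./im.png')
# plt.imshow(img)
# plt.show()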

查看PDF信息

读取文本

读取图片

① 阅读使用手册

② 注册用户账号

介绍

平台内核

注意事项

查看PDF信息

读取文本

读取图片

① 阅读使用手册

② 注册用户账号

③ 登录

Python基础

Python进阶

标准类库

专题工具

图像处理

科学计算

自然语言

开源GIS

R与Julia

介绍

平台内核

注意事项