pip install python-docx

Requirement already satisfied: python-docx in /opt/conda/lib/python3.12/site-packages (1.1.2)
Requirement already satisfied: lxml>=3.1.0 in /opt/conda/lib/python3.12/site-packages (from python-docx) (5.3.1)
Requirement already satisfied: typing_extensions>=4.9.0 in /opt/conda/lib/python3.12/site-packages (from python-docx) (4.12.2)
Note: you may need to restart the kernel to use updated packages.

import docx

from docx import Document
# Open the demo Word document; `dfile` is the handle the later cells
# (core properties, sections, paragraphs) read from.
dfile = Document('/data/demo/demo.docx')

# List the non-dunder attribute names exposed by the document's core
# properties, each prefixed with a running index.
core_properties = dfile.core_properties

# Filter dunder names by prefix instead of slicing `dir()` at a fixed offset:
# the number of `__dunder__` attributes depends on the Python version and the
# class (e.g. `object.__getstate__` was added in Python 3.11), so a hard-coded
# `[27:]` can silently drop or leak names. `dir()` returns names sorted, and
# dunders sort first, so the order of the remaining names is unchanged.
visible_names = [
    name for name in dir(core_properties) if not name.startswith('__')
]
for idx, attr_name in enumerate(visible_names):
    print(idx, attr_name)

0 _element
1 author
2 category
3 comments
4 content_status
5 created
6 identifier
7 keywords
8 language
9 last_modified_by
10 last_printed
11 modified
12 revision
13 subject
14 title
15 version

# Report the bottom margin of every section in the document, one per line.
secs = dfile.sections

for section in secs:
    bottom = section.bottom_margin
    print(bottom)

914400

# Inspect what kind of object `paragraphs` is (the recorded output shows `list`).
type(dfile.paragraphs)

list

# Dump the text of every paragraph, prefixed with its position in the document.
all_paragraphs = dfile.paragraphs
for position, paragraph in enumerate(all_paragraphs):
    print(position, paragraph.text)

0 从百草园到三味书屋
1 
2 我家的后面有一个很大的园，相传叫作百草园。现在是早已并屋子一起卖给朱文公的子孙了，连那最末次的相见也已经隔了七八年，其中似乎确凿只有一些野草；但那时却是我的乐园。
3 
4 这是鲁迅的母校：三味书屋  
5 
6 不必说碧绿的菜畦，光滑的石井栏，高大的皂荚树，紫红的桑椹；也不必说鸣蝉在树叶里长吟，肥胖的黄蜂伏在菜花上，轻捷的叫天子（云雀）忽然从草间直窜向云霄里去了。
单是周围的短短的泥墙根一带，就有无限趣味。
7 
8

# Show the raw text of paragraph 4; the recorded repr reveals a trailing tab
# character that is not obvious in the printed listing.
dfile.paragraphs[4].text

'这是鲁迅的母校：三味书屋\t'

# Open the tables demo document and take its collection of tables.
file_tb=Document('/data/demo/tables.docx')
tb=file_tb.tables
# Inspect the collection's type (the recorded output shows `list`).
type(tb)

list

# Inspect the type of a table's `rows`; per the recorded output it is a
# `docx.table._Rows` object rather than a plain list.
type(tb[0].rows)

docx.table._Rows

# Print the first table row by row, each row as a list of its cells' text.
first_table = tb[0]
for row in first_table.rows:
    print([cell.text for cell in row.cells])

['date', 'type', '1001A']
['20200420', 'AQI', '21']
['20200420', 'PM2.5', '6']
['20200420', 'PM2.5_24h', '35']
['20200420', 'PM10', '11']
['20200420', 'PM10_24h', '53']
['20200420', 'SO2', '3']

# Print the second table row by row, each row as a list of its cells' text.
second_table = tb[1]
for row in second_table.rows:
    print([cell.text for cell in row.cells])

['date', 'type', '1001A']
['20200420', 'AQI', '21']
['20200420', 'PM2.5', '6']
['20200420', 'PM2.5_24h', '35']
['20200420\n20200420\n20200420', 'PM10', '11']
['20200420\n20200420\n20200420', 'PM10_24h', '53']
['20200420\n20200420\n20200420', 'SO2', '3']

# Dimensions of the second table as a (row count, column count) tuple.
len(tb[1].rows), len(tb[1].columns)

(7, 3)

# Print the first table again, this time reading only the first paragraph of
# each cell instead of the cell's whole text.
first_table = tb[0]
for row in first_table.rows:
    print([cell.paragraphs[0].text for cell in row.cells])

['date', 'type', '1001A']
['20200420', 'AQI', '21']
['20200420', 'PM2.5', '6']
['20200420', 'PM2.5_24h', '35']
['20200420', 'PM10', '11']
['20200420', 'PM10_24h', '53']
['20200420', 'SO2', '3']

from docx.enum.style import WD_STYLE_TYPE
# Open a third demo document and take its style collection for inspection.
document=Document('/data/demo/demo1.docx')
styles = document.styles

# Keep only the styles whose type is PARAGRAPH.
paragraph_styles = [
    candidate
    for candidate in styles
    if candidate.type == WD_STYLE_TYPE.PARAGRAPH
]

# Show the names of the first five on a single line, each followed by '; '.
for para_style in paragraph_styles[:5]:
    print(para_style.name, end='; ')

Normal; Heading 1; Heading 2; Heading 3; Heading 4;

# Count the runs that make up paragraph 1 (the recorded output is 2).
len(document.paragraphs[1].runs)

2

# Text carried by the first run of paragraph 1.
document.paragraphs[1].runs[0].text

'在上个段落前在插入一个段落。'

# Text carried by the second run of paragraph 1.
document.paragraphs[1].runs[1].text

'这里格式进行了改变。'

打开文档

获取段落对象

读取表格对象

获取文档中的样式名称

块对象与行内对象

① 阅读使用手册

② 注册用户账号

介绍

平台内核

注意事项

打开文档

获取段落对象

读取表格对象

获取文档中的样式名称

块对象与行内对象

① 阅读使用手册

② 注册用户账号

③ 登录

Python基础

Python进阶

标准类库

专题工具

图像处理

科学计算

自然语言

开源GIS

R与Julia

介绍

平台内核

注意事项