import requests
import re

# Fetch the python.org job board. requests has no default timeout, so pass one
# explicitly to keep the script from hanging forever on an unresponsive server.
response = requests.get('http://python.org/jobs', timeout=10)
response.raise_for_status()  # stop on HTTP 4xx/5xx instead of parsing an error page

# Group 1 captures the relative job URL (without its trailing slash), group 2
# the link text. A raw string keeps the regex escape readable.
# NOTE: the link text is taken verbatim from the HTML, so entities such as
# "&amp;" are printed unescaped.
p = re.compile(r'<a href="(/jobs/\d+)/">(.*?)</a>')
for url, name in p.findall(response.text):
    print(f"{name} (https://python.org{url})")

Research Engineer (https://python.org/jobs/7854)
Backend Python Developer (Django/DRF) (https://python.org/jobs/7853)
CTO (Python Expert &amp; Technical Leader) – Equity/Stock Options Available (https://python.org/jobs/7852)
Senior Python SDR Software Engineer (https://python.org/jobs/7851)
Senior Financial Analyst (https://python.org/jobs/7849)
Senior Software Engineer (LATAM) (https://python.org/jobs/7848)
Scientific Software Engineer (https://python.org/jobs/7847)
Senior Scientific Software Engineer (https://python.org/jobs/7846)
Core Tech AI Engineer at CodeFlash.ai (https://python.org/jobs/7844)
Python Software Engineer (Docker required) (https://python.org/jobs/7843)
Senior Data Analyst (https://python.org/jobs/7842)
Senior Python Engineer (https://python.org/jobs/7841)
Senior Full Stack Web Developer (Python/Django + CMS) (https://python.org/jobs/7840)
Python Software Engineer (https://python.org/jobs/7836)
Senior Platform Engineer (https://python.org/jobs/7835)
Drone Python Programmer/Software Engineer (https://python.org/jobs/7834)
Senior Back-End Engineer (Python) (https://python.org/jobs/7832)
Senior Software Engineer (https://python.org/jobs/7830)
Senior Python Full Stack Developer in Canada. (100% Remote) (https://python.org/jobs/7829)
Python React Developer (https://python.org/jobs/7828)
Full Stack Engineer (Django) (https://python.org/jobs/7827)
Python/C++ Developer (https://python.org/jobs/7826)
Senior Python Backend Developer with Blockchain Experience (https://python.org/jobs/7824)
Python/MATLAB Programmer (https://python.org/jobs/7823)
Senior Python Backend Engineer (https://python.org/jobs/7821)

# pip install pytidylib

from html.parser import HTMLParser
import urllib.request

# 自定义HTML解析器类
class MyHTMLParser(HTMLParser):
    """Collect link targets and print the text of ``<div>`` elements with a given class.

    Attributes:
        links: ``href`` values of every ``<a>`` tag seen so far, in document order.
        current_data: text accumulated for the ``<div>`` currently being recorded.
        recording: True while the parser is inside a matching ``<div>``.
        target_class: CSS class name that marks the ``<div>`` elements to report.
    """

    def __init__(self, target_class="some-class"):
        """Create a parser that reports ``<div>`` elements carrying *target_class*."""
        super().__init__()
        self.links = []
        self.current_data = ""
        self.recording = False
        self.target_class = target_class  # class name to look for
        # Nesting level of <div> tags inside the recorded element, so that an
        # inner </div> does not end the recording prematurely.
        self._div_depth = 0

    def _has_target_class(self, attrs):
        """Return True if *attrs* contains a ``class`` attribute naming the target class."""
        for attr, value in attrs:
            if attr != 'class' or not value:
                # Not a class attribute, or a value-less one (value is None).
                continue
            # Accept an exact match as well as one token of a multi-valued
            # attribute such as class="some-class highlighted".
            if value == self.target_class or self.target_class in value.split():
                return True
        return False

    def handle_starttag(self, tag, attrs):
        """Record link targets and start capturing text on a matching ``<div>``."""
        if tag == 'a':
            # Collect the link target
            for attr, value in attrs:
                if attr == 'href':
                    self.links.append(value)
        elif tag == 'div':
            if self.recording:
                # Nested <div> inside the recorded element: just track depth.
                self._div_depth += 1
            elif self._has_target_class(attrs):
                self.recording = True
                self._div_depth = 1

    def handle_data(self, data):
        """Accumulate text while inside a matching ``<div>``."""
        if self.recording:
            self.current_data += data

    def handle_endtag(self, tag):
        """Print the captured text once the matching ``<div>`` itself is closed."""
        if tag != 'div' or not self.recording:
            return
        self._div_depth -= 1
        if self._div_depth > 0:
            # Only an inner <div> was closed; keep recording.
            return
        print("找到内容:", self.current_data.strip())
        self.current_data = ""
        self.recording = False
        self._div_depth = 0

# Fetch the page
url = "https://example.com"  # replace with the target URL
# The context manager closes the connection even if reading or decoding fails;
# the timeout keeps the script from blocking forever on a dead server.
with urllib.request.urlopen(url, timeout=10) as response:
    # Prefer the charset declared in the Content-Type header, defaulting to UTF-8.
    charset = response.headers.get_content_charset() or 'utf-8'
    html_content = response.read().decode(charset)

# Create the parser and feed it the document
parser = MyHTMLParser()
parser.feed(html_content)
parser.close()  # flush any text the parser is still buffering

# Print every collected link
print("\n网页中的所有链接:")
for link in parser.links:
    print(link)

网页中的所有链接:
https://www.iana.org/domains/example

class AdvancedParser(HTMLParser):
    """Collect the text inside ``<div id="content">``, including nested elements.

    Attributes:
        inside_target: True while the parser is inside the target ``<div>``.
        depth: nesting level of ``<div>`` tags inside the target (0 when outside).
        result: stripped, non-empty text chunks found inside the target.
    """

    def __init__(self):
        super().__init__()
        self.inside_target = False
        self.depth = 0
        self.result = []

    def handle_starttag(self, tag, attrs):
        """Enter the target on ``<div id="content">`` and track nested ``<div>`` tags."""
        # Only <div> tags take part in depth counting. Counting every tag
        # breaks on void elements (<br>, <img>) and on omitted end tags
        # (<p>, <li>), which never produce a matching end-tag event and
        # would leave the depth permanently above zero.
        if tag != 'div':
            return
        if ('id', 'content') in attrs:
            self.inside_target = True
        if self.inside_target:
            self.depth += 1

    def handle_endtag(self, tag):
        """Leave the target once its own closing ``</div>`` is reached."""
        if tag != 'div' or not self.inside_target:
            return
        self.depth -= 1
        if self.depth <= 0:
            self.depth = 0
            self.inside_target = False

    def handle_data(self, data):
        """Store stripped text found inside the target, skipping whitespace-only chunks."""
        if not self.inside_target:
            return
        text = data.strip()
        if text:
            # Skipping empty chunks keeps ' '.join(result) free of stray gaps.
            self.result.append(text)

# Usage example: run the parser over the previously downloaded page and
# print the collected text chunks joined by single spaces.
parser = AdvancedParser()
parser.feed(html_content)
extracted_text = ' '.join(parser.result)
print("提取的内容:", extracted_text)

提取的内容:

from bs4 import BeautifulSoup
import requests

# 1. Fetch the page
url = "https://example.com"  # replace with the URL you want to scrape
# requests has no default timeout; set one so a dead server cannot hang the script.
response = requests.get(url, timeout=10)
response.raise_for_status()  # stop on HTTP 4xx/5xx instead of parsing an error page
html_content = response.text

# 2. Build the BeautifulSoup tree
soup = BeautifulSoup(html_content, 'html.parser')

# 3. Basic extraction examples
# Every link target
print("网页中的所有链接:")
for link in soup.find_all('a'):
    print(link.get('href'))

# Page title; soup.title is None when the document has no <title> element,
# so guard the attribute access instead of raising AttributeError.
title = soup.title.string if soup.title is not None else None
print(f"\n网页标题: {title}")

# Text of elements carrying a specific class
print("\n特定class的内容:")
for item in soup.find_all(class_='some-class'):  # replace 'some-class' with the real class name
    print(item.text.strip())

# 4. A more involved example
# Table data, one list per row. Header cells (<th>) are included so header
# rows are not printed as empty lists.
print("\n表格数据:")
for table in soup.find_all('table'):
    for row in table.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        print(cells)

网页中的所有链接:
https://www.iana.org/domains/example

网页标题: Example Domain

特定class的内容:

表格数据:

回调方法	何时被调用
`handle_starttag(tag, attrs)`	遇到开始标签时调用。`attrs` 是一个由形如(name, value)的元组组成的序列
`handle_startendtag(tag, attrs)`	遇到空标签时调用。默认分别处理开始标签和结束标签
`handle_endtag(tag)`	遇到结束标签时调用
`handle_data(data)`	遇到文本数据时调用
`handle_charref(ref)`	遇到形如 `&#ref;` 的字符引用时调用
`handle_entityref(name)`	遇到形如 `&name;` 的实体引用时调用
`handle_comment(data)`	遇到注释时调用；`data` 只包含注释的内容
`handle_decl(decl)`	遇到形如`<!...>`的声明时调用
`handle_pi(data)`	遇到处理指令（形如 `<?...>`）时调用
`unknown_decl(data)`	遇到未知声明时调用

简单的网页抓取程序

Tidy 和 XHTML 解析

Tidy

获取Tidy

为何使用XHTML

使用HTMLParser

`HTMLParser` 的网页抓取程序

Beautiful Soup

网页抓取程序

① 阅读使用手册

② 注册用户账号

介绍

平台内核

注意事项

简单的网页抓取程序

Tidy 和 XHTML 解析

Tidy

获取Tidy

为何使用XHTML

使用HTMLParser

HTMLParser 的网页抓取程序

Beautiful Soup

网页抓取程序

① 阅读使用手册

② 注册用户账号

③ 登陆

Python基础

Python进阶

标准类库

专题工具

图像处理

科学计算

自然语言

开源GIS

R 编程语言

Julia编程语言

介绍

平台内核

注意事项

`HTMLParser` 的网页抓取程序