bs4

基本概念

Beautiful Soup 是一个可以从 HTML 或 XML 文件中提取数据的网页信息提取库。

快速开始

bs4对象种类

Tag : 标签
NavigableString : 可导航的字符串
BeautifulSoup : bs对象 父类为Tag
Comment : 注释

获取bs4对象并规范化打印

from bs4 import BeautifulSoup

# Sample markup fed to the parser.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Build the BeautifulSoup object, using lxml as the parser.
soup = BeautifulSoup(html_doc, features='lxml')

# Pretty-print the parsed document (output omitted for brevity).
pretty_markup = soup.prettify()
print(pretty_markup)

获取title标签内容、获取title标签名称、 title标签文本内容、获取p段落

from bs4 import BeautifulSoup

# Sample markup fed to the parser.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Build the BeautifulSoup object, using lxml as the parser.
soup = BeautifulSoup(html_doc, features='lxml')

# The <title> tag itself.
title_tag = soup.title
print(title_tag, type(title_tag))  # <title>The Dormouse's story</title> <class 'bs4.element.Tag'>

# The name of the <title> tag.
title_name = title_tag.name
print(title_name, type(title_name))  # title <class 'str'>

# The text held inside the <title> tag.
title_text = title_tag.string
print(title_text, type(title_text))  # The Dormouse's story <class 'bs4.element.NavigableString'>

# The <p> paragraph reached through attribute access.
paragraph = soup.p
print(paragraph, type(paragraph))  # <p class="title"><b>The Dormouse's story</b></p> <class 'bs4.element.Tag'>

遍历文档树遍历子节点

bs 里面有三种情况：第一个是遍历，第二个是查找，第三个是修改
contents 返回的是一个包含所有子节点的列表
children 返回的是一个子节点的迭代器，通过 for 循环遍历
descendants 返回的是一个生成器，用于遍历子子孙孙（所有后代节点）

from bs4 import BeautifulSoup

# Sample markup fed to the parser.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, features='lxml')
tag = soup.head

# .contents: the child nodes of <head>, returned as a list.
head_contents = tag.contents
print(head_contents)  # [<title>The Dormouse's story</title>]

# .children: an iterator over the child nodes.
# Recorded output: <list_iterator object at 0x000001CB3D709470> <class 'list_iterator'>
# NOTE(review): the exact iterator type may depend on the installed bs4 version - confirm.
print(tag.children, type(tag.children))
for child in tag.children:
    print(child)  # <title>The Dormouse's story</title>

# .descendants: a generator that walks children, grandchildren and so on.
for node in soup.descendants:
    print(node)  # output is long; omitted

获取内容

string 获取标签里面的内容
strings 返回的是一个生成器对象，用来获取多个标签的内容
stripped_strings 和 strings 基本一致，但是它可以把多余的空格去掉

from bs4 import BeautifulSoup

# Sample markup fed to the parser.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# .string: the text inside a single tag.
print(soup.title.string)  # The Dormouse's story

# .strings is a generator, so printing it directly would only show the
# generator object. Iterate over it instead; repr() makes the whitespace
# and newlines between tags visible.
for text in soup.strings:
    print(repr(text))

# .stripped_strings: same idea, but with the extra whitespace removed.
for c in soup.stripped_strings:
    print(c)

遍历父节点

parent 直接获得父节点
parents 获取所有的父节点

from bs4 import BeautifulSoup

# Sample markup fed to the parser.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, features='lxml')

# .parent: the node that directly contains <title>.
title_parent = soup.title.parent
print(title_parent)  # <head><title>The Dormouse's story</title></head>

# Not printed here: soup.html.parent, i.e. the parent of <html>, is the whole document.

# .parents: printing it shows the generator object, not the ancestors themselves.
p_ancestors = soup.p.parents
print(p_ancestors)  # <generator object parents at 0x000002CF36CADE60>

遍历兄弟节点

next_sibling 下一个兄弟结点
previous_sibling 上一个兄弟结点
next_siblings 后面的所有兄弟结点
previous_siblings 前面的所有兄弟结点

from bs4 import BeautifulSoup

# Minimal markup: <b> and <c> are siblings inside <a>.
html = '<a><b>bbb</b><c>ccc</c></a>'
soup = BeautifulSoup(html, features='lxml')

# The node that follows <b>.
following = soup.b.next_sibling
print(following)  # <c>ccc</c>

# The node that precedes <c>.
preceding = soup.c.previous_sibling
print(preceding)  # <b>bbb</b>

搜索树

字符串过滤器
正则表达式过滤器
- 我们用正则表达式里面的 compile 方法编译一个正则表达式，传给 find 或者 find_all 方法，就可以实现基于正则表达式过滤器的搜索
列表过滤器
True过滤器
方法过滤器

from bs4 import BeautifulSoup

# Sample markup fed to the parser.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# --- Searching the tree ---
soup = BeautifulSoup(html_doc, features='lxml')

# String filter: the plain string 'a' is the filter.
first_link = soup.find('a')
print(first_link)

# Regular-expression filter: compile a pattern with re.compile() and pass it
# to find() or find_all() to search by regular expression.
# (No example is given in this snippet.)

# List filter: a list of tag names is passed as the filter.
paragraphs_and_links = soup.find_all(['p', 'a'])
print(paragraphs_and_links)
print('----------')
titles_and_bolds = soup.find_all(['title', 'b'])
print(titles_and_bolds)

# True filter (no example is given in this snippet).
# Function filter (no example is given in this snippet).

find_all() 和 find()

find_all() 方法以列表形式返回所有搜索到的标签数据
find() 方法返回搜索到的第一条数据
find_all() 方法参数

def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs):

name：tag 标签名
attrs：标签的属性
recursive：是否递归搜索
text：文本内容（Beautiful Soup 4.4.0 起该参数改名为 string，text 为旧名称）
limit：限制返回条数
kwargs：关键字参数

from bs4 import BeautifulSoup

# Sample markup: a job-listing table with one header row (class "h")
# followed by ten data rows whose class alternates "even" / "odd".
html = """
<table class="tablelist" cellpadding="0" cellspacing="0">
    <tbody>
        <tr class="h">
            <td class="l" width="374">职位名称</td>
            <td>职位类别</td>
            <td>人数</td>
            <td>地点</td>
            <td>发布时间</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">22989-金融云区块链高级研发工程师（深圳）</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">22989-金融云高级后台开发</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐运营开发工程师（深圳）</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐业务运维工程师（深圳）</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-25</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">TEG03-高级研发工程师（深圳）</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">TEG03-高级图像算法研发工程师（深圳）</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">TEG11-高级AI开发工程师（深圳）</a></td>
            <td>技术类</td>
            <td>4</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a id="test" class="test" target='_blank' href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">SNG11-高级业务运维工程师（深圳）</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2017-11-24</td>
        </tr>
    </tbody>
</table>
"""
soup = BeautifulSoup(html, 'lxml')

# find(): return the first matching tag.
tr = soup.find('tr')
print(tr)
#<tr class="h">
#<td class="l" width="374">职位名称</td>
#<td>职位类别</td>
#<td>人数</td>
#<td>地点</td>
#<td>发布时间</td>
#</tr>

# find_all(): return all matching tags as a list.
trs = soup.find_all('tr')
for t in trs:
    print(t)  # output omitted for brevity
    print('------------')

# Get the second <tr> tag.
# (This description line was previously not marked as a comment, which
# made the script fail.)
trs = soup.find_all('tr', limit=2)[1]  # without [1] the result is the list of the first two rows
print(trs)  # output omitted for brevity

# Get the <tr> tags with class="even".
even = soup.find_all('tr', class_='even')
for tr in even:
    print(tr)  # output omitted for brevity
even = soup.find_all('tr', attrs={'class': "even"})  # attrs gives the same result
for tr in even:
    print(tr)  # output omitted for brevity

# Extract every <a> tag that has both id="test" and class="test".
# (This description line was previously not marked as a comment, which
# was a syntax error.)
r = soup.find_all('a', id='test', class_='test')
for tr in r:
    print(tr)  # output omitted for brevity

# Print the href attribute of every <a> tag.
a = soup.find_all('a')
for i in a:
    href = i.attrs['href']  # i['href'] gives the same result
    print(href)  # output omitted for brevity

# Print the job title text of every data row ([1:] skips the header row).
trs = soup.find_all('tr')[1:]
for tr in trs:
    tds = tr.find_all('td')
    job = tds[0].string
    print(job)
#22989-金融云区块链高级研发工程师（深圳）
#22989-金融云高级后台开发
#SNG16-腾讯音乐运营开发工程师（深圳）
#SNG16-腾讯音乐业务运维工程师（深圳）
#TEG03-高级研发工程师（深圳）
#TEG03-高级图像算法研发工程师（深圳）
#TEG11-高级AI开发工程师（深圳）
#15851-后台开发工程师
#15851-后台开发工程师
#SNG11-高级业务运维工程师（深圳）