# Demo: parse a small HTML document and print it in indented form.
from bs4 import BeautifulSoup

# Backslash-newlines keep the literal readable without adding any newline
# characters to the markup itself.
html_doc = """ \
<html><head><title>The Dormouse's story</title></head> \
<body> \
<p class="title"><b>The Dormouse's story</b></p> \
<p class="story">Once upon a time there were three little sisters; and their names were \
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, \
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and \
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; \
and they lived at the bottom of a well.</p> \
<p class="story">...</p> \
"""

# Build the BeautifulSoup object using the lxml parser.
soup = BeautifulSoup(html_doc, "lxml")

# Print the parsed document in its normalized, indented form
# (the output itself is not reproduced here).
pretty_markup = soup.prettify()
print(pretty_markup)
# Demo: reach tags through attribute access and inspect the object types.
from bs4 import BeautifulSoup

# Backslash-newlines keep the literal readable without adding any newline
# characters to the markup itself.
html_doc = """ \
<html><head><title>The Dormouse's story</title></head> \
<body> \
<p class="title"><b>The Dormouse's story</b></p> \
<p class="story">Once upon a time there were three little sisters; and their names were \
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, \
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and \
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; \
and they lived at the bottom of a well.</p> \
<p class="story">...</p> \
"""

# Build the BeautifulSoup object using the lxml parser.
soup = BeautifulSoup(html_doc, "lxml")

# The <title> tag itself.
# Recorded output: <title>The Dormouse's story</title> <class 'bs4.element.Tag'>
title_tag = soup.title
print(title_tag, type(title_tag))

# The name of the <title> tag.
# Recorded output: title <class 'str'>
tag_name = title_tag.name
print(tag_name, type(tag_name))

# The text held by the <title> tag.
# Recorded output: The Dormouse's story <class 'bs4.element.NavigableString'>
title_text = title_tag.string
print(title_text, type(title_text))

# The <p> tag reached via attribute access.
# Recorded output:
#   <p class="title"><b>The Dormouse's story</b></p> <class 'bs4.element.Tag'>
first_paragraph = soup.p
print(first_paragraph, type(first_paragraph))
# Demo: walk downwards through the tree (contents, children, descendants).
from bs4 import BeautifulSoup

# Backslash-newlines keep the literal readable without adding any newline
# characters to the markup itself.
html_doc = """ \
<html><head><title>The Dormouse's story</title></head> \
<body> \
<p class="title"><b>The Dormouse's story</b></p> \
<p class="story">Once upon a time there were three little sisters; and their names were \
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, \
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and \
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; \
and they lived at the bottom of a well.</p> \
<p class="story">...</p> \
"""

soup = BeautifulSoup(html_doc, "lxml")
tag = soup.head

# Child nodes of <head>, returned as a list.
# Recorded output: [<title>The Dormouse's story</title>]
print(tag.contents)

# An iterator over the same child nodes.
# Recorded output (address varies; the type may differ between bs4 versions):
#   <list_iterator object at 0x000001CB3D709470> <class 'list_iterator'>
child_iterator = tag.children
print(child_iterator, type(child_iterator))

# Recorded output: <title>The Dormouse's story</title>
for child in tag.children:
    print(child)

# Every descendant of the document; the output is long and is not
# reproduced here.
for node in soup.descendants:
    print(node)
获取内容
string 获取标签里面的内容
strings 返回的是一个生成器对象，用来获取多个标签的内容
stripped_strings 和 strings 基本一致，但是它可以把多余的空格去掉
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
# Demo: extract text with .string, .strings and .stripped_strings.
from bs4 import BeautifulSoup

# Backslash-newlines keep the literal readable without adding any newline
# characters to the markup itself.
html_doc = """ \
<html><head><title>The Dormouse's story</title></head> \
<body> \
<p class="title"><b>The Dormouse's story</b></p> \
<p class="story">Once upon a time there were three little sisters; and their names were \
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, \
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and \
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; \
and they lived at the bottom of a well.</p> \
<p class="story">...</p> \
"""

soup = BeautifulSoup(html_doc, "lxml")

# Text of a single tag.
# Recorded output: The Dormouse's story
print(soup.title.string)

# .strings is lazy: printing it shows only a generator object, not the text.
print(soup.strings)

# Iterate to see the strings themselves. repr() makes the whitespace-only
# entries between tags visible, which is the "lots of whitespace" this step
# is meant to show.
for raw_text in soup.strings:
    print(repr(raw_text))

# .stripped_strings yields the same text with the surrounding whitespace
# removed.
for clean_text in soup.stripped_strings:
    print(clean_text)
遍历父节点
parent 直接获得父节点
parents 获取所有的父节点
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
# Demo: walk upwards through the tree with .parent and .parents.
from bs4 import BeautifulSoup

# Backslash-newlines keep the literal readable without adding any newline
# characters to the markup itself.
html_doc = """ \
<html><head><title>The Dormouse's story</title></head> \
<body> \
<p class="title"><b>The Dormouse's story</b></p> \
<p class="story">Once upon a time there were three little sisters; and their names were \
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, \
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and \
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; \
and they lived at the bottom of a well.</p> \
<p class="story">...</p> \
"""

soup = BeautifulSoup(html_doc, "lxml")

# Direct parent of <title>.
# Recorded output: <head><title>The Dormouse's story</title></head>
title_parent = soup.title.parent
print(title_parent)

# Not executed: print(soup.html.parent)
# The parent of <html> is the whole document.

# .parents is lazy, so printing it shows the generator object itself.
# Recorded output (address varies):
#   <generator object parents at 0x000002CF36CADE60>
ancestor_generator = soup.p.parents
print(ancestor_generator)
遍历兄弟节点
next_sibling 下一个兄弟节点
previous_sibling 上一个兄弟节点
next_siblings 后面的所有兄弟节点
previous_siblings 前面的所有兄弟节点
1 2 3 4 5
# Demo: move sideways between sibling nodes.
from bs4 import BeautifulSoup

html = '<a><b>bbb</b><c>ccc</c></a>'
soup = BeautifulSoup(html, "lxml")

# Sibling that follows <b>.
# Recorded output: <c>ccc</c>
following = soup.b.next_sibling
print(following)

# Sibling that precedes <c>.
# Recorded output: <b>bbb</b>
preceding = soup.c.previous_sibling
print(preceding)
# Demo: search the tree using the different kinds of filter.
from bs4 import BeautifulSoup

# Backslash-newlines keep the literal readable without adding any newline
# characters to the markup itself.
html_doc = """ \
<html><head><title>The Dormouse's story</title></head> \
<body> \
<p class="title"><b>The Dormouse's story</b></p> \
<p class="story">Once upon a time there were three little sisters; and their names were \
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, \
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and \
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; \
and they lived at the bottom of a well.</p> \
<p class="story">...</p> \
"""

soup = BeautifulSoup(html_doc, "lxml")

# --- String filter --------------------------------------------------------
# The tag name 'a' is the string filter. Despite the plural variable name,
# find() returns a single result (the first match).
a_tags = soup.find("a")
print(a_tags)

# --- Regular-expression filter --------------------------------------------
# A pattern built with re.compile() can be passed to find() or find_all() to
# search by regular expression. Not demonstrated in this snippet.

# --- List filter ----------------------------------------------------------
# A list of tag names matches a tag whose name is any one of them.
paragraphs_and_links = soup.find_all(["p", "a"])
print(paragraphs_and_links)
print('----------')
titles_and_bold = soup.find_all(["title", "b"])
print(titles_and_bold)

# --- True filter ----------------------------------------------------------
# Not demonstrated in this snippet.

# --- Function filter ------------------------------------------------------
# Not demonstrated in this snippet.