Python beautiful soup - bs4

e2m 1,961 views 20 slides Dec 06, 2015
Slide 1
Slide 1 of 20
Slide 1
1
Slide 2
2
Slide 3
3
Slide 4
4
Slide 5
5
Slide 6
6
Slide 7
7
Slide 8
8
Slide 9
9
Slide 10
10
Slide 11
11
Slide 12
12
Slide 13
13
Slide 14
14
Slide 15
15
Slide 16
16
Slide 17
17
Slide 18
18
Slide 19
19
Slide 20
20

About This Presentation

Python CodeLabs - Python bs4 - Introduction to BeautifulSoup
http://eueung.github.io/python/bs4


Slide Content


Python - BeautifulSoup bs4
Eueung Mulyana
http://eueung.github.io/python/bs4
Python CodeLabs | Attribution-ShareAlike CC BY-SA
1 / 20

Agenda
bs4 Basics
Easy Web Scraping
2 / 20


Basics
3 / 20

Example #1
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="title">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story"> ... </p>
</body>
</html>
4 / 20

Example #2
for link in soup.find_all('a'):
print(link.get('href'))
print(soup.get_text())
print soup.title # <title>The Dormouse's story</title>
print soup.title.name # u'title'
print soup.title.string # u'The Dormouse's story'
print soup.title.parent.name # u'head'
print soup.p # <p class="title"><b>The Dormouse's story</b></p>
print soup.p['class'] # u'title'
print soup.a # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print soup.find(id="link3") # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
print soup.find_all('a') # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<title>The Dormouse's story</title>
title
The Dormouse's story
head
<p class="title"><b>The Dormouse's story</b></p>
[u'title']
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, ...]
http://example.com/elsie
http://example.com/lacie
http://example.com/tillie
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
5 / 20

Example #3
head_tag = soup.head
print head_tag # <head><title>The Dormouse's story</title></head>
print head_tag.contents # [<title>The Dormouse's story</title>]
print head_tag.string # u'The Dormouse's story'
for child in head_tag.descendants: # <title>The Dormouse's story</title>
print(child) # The Dormouse's story
# -------
title_tag = head_tag.contents[0]
print title_tag # <title>The Dormouse's story</title>
print title_tag.contents # [u'The Dormouse's story']
print title_tag.string # u'The Dormouse's story'
text = title_tag.contents[0]
# print text.contents # AttributeError: 'NavigableString' object has no attribute 'contents'
for child in title_tag.children:
print(child) # The Dormouse's story
<head><title>The Dormouse's story</title></head>
[<title>The Dormouse's story</title>]
The Dormouse's story
<title>The Dormouse's story</title>
The Dormouse's story
# -------
<title>The Dormouse's story</title>
[u"The Dormouse's story"]
The Dormouse's story
The Dormouse's story
6 / 20

Example #4
# print soup.contents
print len(soup.contents) # 1
print soup.contents[0].name # u'html'
print len(list(soup.children)) # 1
print len(list(soup.descendants)) # 25
# --------
print(soup.html.string) # None
#for string in soup.strings:
# print(repr(string))
for string in soup.stripped_strings:
print(repr(string))
1
html
1
25
None
# --------
u"The Dormouse's story"
u"The Dormouse's story"
u'Once upon a time there were three little sisters; and their names were'
u'Elsie'
u','
u'Lacie'
u'and'
u'Tillie'
u';\nand they lived at the bottom of a well.'
u'...'
7 / 20

Example #5
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
tag = soup.b
print type(tag)
print tag.name # u'b'
tag.name = "blockquote"
print tag # <blockquote class="boldest">Extremely bold</blockquote>
print tag['class'] # u'boldest'
print tag.attrs # {u'class': u'boldest'}
tag['class'] = 'verybold'
tag['id'] = 1
print tag # <blockquote class="verybold" id="1">Extremely bold</blockquote>
# -------------
del tag['class']
del tag['id']
print tag # <blockquote>Extremely bold</blockquote>
#print tag['class'] # KeyError: 'class'
print(tag.get('class')) # None
<class 'bs4.element.Tag'>
b
<blockquote class="boldest">Extremely bold</blockquote>
['boldest']
{'class': ['boldest']}
<blockquote class="verybold" id="1">Extremely bold</blockquote>
# -------------
<blockquote>Extremely bold</blockquote>
None
8 / 20

Example #6
class_soup = BeautifulSoup('<p class="body strikeout"></p>')
print class_soup.p['class'] # ["body", "strikeout"]
class_soup = BeautifulSoup('<p class="body"></p>')
print class_soup.p['class'] # ["body"]
id_soup = BeautifulSoup('<p id="my id"></p>')
print id_soup.p['id'] # 'my id'
# ----------
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
print rel_soup.a['rel'] # ['index']
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p) # <p>Back to the <a rel="index contents">homepage</a></p>
# ----------
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
print xml_soup.p['class'] # u'body strikeout'
['body', 'strikeout']
['body']
my id
# ----------
['index']
<p>Back to the <a rel="index contents">homepage</a></p>
# ----------
body strikeout
When you turn a tag back into a string, multiple attribute
values are consolidated.
If you parse a document as XML, there are no multi-
valued attributes.
The rel attribute specifies the relationship between the
current document and the linked document (Only used if
the href attribute is present).
9 / 20


Easy Web Scraping
@miguelgrinberg
10 / 20

PyCon 2014 - Page Index
11 / 20

PyCon 2014 - Page Details
12 / 20

PyCon 2014 - YouTube
13 / 20

Test #1
import requests
import bs4
response = requests.get('http://pyvideo.org/category/50/pycon-us-2014')
soup = bs4.BeautifulSoup(response.text)
# atags = soup.select('div#video-summary-content a[href^=/video]')
links = [a.attrs.get('href') for a in soup.select('div#video-summary-content a[href^=/video]')]
print links[1:5]
['/video/2676/2d3d-graphics-with-python-on-mobile-platforms', ...]
links = [a.attrs.get('href') for a in soup.select('div#video-summary-content strong a[href^=/video]')]
pycon-scraper.py #1
import bs4
import re
import requests
import argparse
from multiprocessing.pool import ThreadPool as Pool
root_url = 'http://pyvideo.org'
index_url = root_url + '/category/50/pycon-us-2014'
def get_video_page_urls():
response = requests.get(index_url)
soup = bs4.BeautifulSoup(response.text)
allvids = [a.attrs.get('href') for a in soup.select('div#video-summary-content strong a[href^=/video]')]
return allvids[0:10]
14 / 20

Test #2
root_url = 'http://pyvideo.org'
video_page_url = '/video/2668/writing-restful-web-services-with-flask'
video_data = {}
response = requests.get(root_url + video_page_url)
soup = bs4.BeautifulSoup(response.text)
video_data['title'] = soup.select('div#videobox h3')[0].get_text()
video_data['speakers'] = [a.get_text() for a in soup.select('div#sidebar a[href^=/speaker]')]
video_data['youtube_url'] = soup.select('div#sidebar a[href^=http://www.youtube.com]')[0].get('href')
print video_data
{'speakers': [u'Miguel Grinberg'], 'youtube_url': u'http://www.youtube.com/watch?v=px_vg9Far1Y', 'title': u'Writing RESTful web services with Flask'}
pycon-scraper.py #2
def get_video_data(video_page_url):
video_data = {}
response = requests.get(root_url + video_page_url)
soup = bs4.BeautifulSoup(response.text)
video_data['title'] = soup.select('div#videobox h3')[0].get_text()
video_data['speakers'] = [a.get_text() for a in soup.select('div#sidebar a[href^=/speaker]')]
video_data['youtube_url'] = soup.select('div#sidebar a[href^=http://www.youtube.com]')[0].get('href')
# ...
return video_data
15 / 20

Test #3
import requests
import bs4
import re
response = requests.get('https://www.youtube.com/watch?v=px_vg9Far1Y')
soup = bs4.BeautifulSoup(response.text)
video_data['views'] = int(re.sub('[^0-9]', '', soup.select('.watch-view-count')[0].get_text().split()[0]))
video_data['likes'] = int(re.sub('[^0-9]', '', soup.select('.like-button-renderer-like-button span.yt-uix-button-content')[0].get_text().split()[0]))
video_data['dislikes'] = int(re.sub('[^0-9]', '', soup.select('.like-button-renderer-dislike-button span.yt-uix-button-content')[0].get_text().split()[0]))
print video_data
{'speakers': [u'Miguel Grinberg'], 'views': 11908, 'title': u'Writing RESTful web services with Flask', ...}
pycon-scraper.py #3
def get_video_data(video_page_url):
# ...
# initialize counters
video_data['views'] = 0
video_data['likes'] = 0
video_data['dislikes'] = 0
try:
response = requests.get(video_data['youtube_url'], headers={
soup = bs4.BeautifulSoup(response.text)
video_data['views'] = int(re.sub('[^0-9]', '', soup.select(
video_data['likes'] = int(re.sub('[^0-9]', '',soup.select(
video_data['dislikes'] = int(re.sub('[^0-9]', '',soup.select(
except:
# some or all of the counters could not be scraped
pass
return video_data
16 / 20

pycon-scraper.py #4
Notes
def show_video_stats(options):
pool = Pool(8)
video_page_urls = get_video_page_urls()
results = pool.map(get_video_data, video_page_urls)
# -----
def func(video): return video[options.sort]
def parse_args():
parser = argparse.ArgumentParser(description='Show PyCon 2014 video statistics.')
parser.add_argument('--sort', metavar='FIELD', choices=['views', 'likes', 'dislikes'], default='views', help='sort by the specified field.')
parser.add_argument('--max', metavar='MAX', type=int, help='show the top MAX entries only.')
parser.add_argument('--csv', action='store_true', default=False, help='output the data in CSV format.')
parser.add_argument('--workers', type=int, default=8, help='number of workers to use, 8 by default.')
return parser.parse_args()
# ex: python pycon-scraper.py --sort views --max 25 --workers 8
def show_video_stats(options):
pool = Pool(options.workers)
video_page_urls = get_video_page_urls()
results = sorted(pool.map(get_video_data, video_page_urls), key=func, reverse=True)
print len(results)
max = options.max
if max is None or max > len(results):
max = len(results)
if options.csv:
print(u'"title","speakers","views","likes","dislikes"')
else:
print(u'Views +1 -1 Title (Speakers)')
for i in range(max):
if options.csv:
print(u'"{0}","{1}",{2},{3},{4}'.format(results[i]['title'], ', '.join(results[i]['speakers']), results[i]['views'], results[i]['likes'], results[i]['dislikes']))
else:
print(u'{0:5d} {1:3d} {2:3d} {3} ({4})'.format(results[i]['views'], results[i]['likes'], results[i]['dislikes'], results[i]['title'], ', '.join(results[i]['speakers'])))
if __name__ == '__main__':
show_video_stats(parse_args())
17 / 20

$> python pycon-scraper.py --sort views
Views +1 -1 Title (Speakers)
8608 80 10 Analyzing Rap Lyrics with Python (Julie Lavoie)
7302 28 1 Building the App (Mike Bayer)
4603 25 3 2D/3D graphics with Python on mobile platforms (Niko Skrypnik)
4056 30 0 Designing Django's Migrations (Andrew Godwin)
3923 41 0 Cheap Helicopters In My Living Room (Ned Jackson Lovely)
3407 36 2 A Scenic Drive through the Django Request-Response Cycle (Dan Langer)
3343 33 0 Advanced techniques for Web functional testing (Julien Phalip)
1598 28 0 Data intensive biology in the cloud: instrumenting ALL the things(C. Titus Brown)
941 8 0 Deliver Your Software In An Envelope (Augie Fackler, Nathaniel Manista)
751 1 0 Closing address - PyCon 2014 (2014/04/13) ()
18 / 20

References
1. Beautiful Soup Documentation
2. Easy Web Scraping with Python
3. Generate statistics about PyCon 2014 videos
19 / 20


END
Eueung Mulyana
http://eueung.github.io/python/bs4
Python CodeLabs | Attribution-ShareAlike CC BY-SA
20 / 20