Example #1
# Sample HTML document ("The Dormouse's story") used by all the examples below.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
# Parse with the stdlib 'html.parser' backend — no external parser required.
soup = BeautifulSoup(html_doc, 'html.parser')
# prettify() re-serializes the parse tree, one tag per line, indented.
print(soup.prettify())
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="title">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story"> ... </p>
</body>
</html>
4 / 20
Example #2
# Print the href of every <a> tag found in the document, then dump the
# document's text content with all markup stripped.
for anchor in soup.find_all('a'):
    print(anchor.get('href'))
print(soup.get_text())
# Basic tree navigation — converted from Python 2 `print` statements to
# Python 3 calls for consistency with the rest of this file (see prettify
# and find_all examples above).
print(soup.title)              # <title>The Dormouse's story</title>
print(soup.title.name)         # 'title'
print(soup.title.string)       # "The Dormouse's story"
print(soup.title.parent.name)  # 'head'
print(soup.p)                  # <p class="title"><b>The Dormouse's story</b></p>
print(soup.p['class'])         # ['title']  (class is multi-valued, so a list)
print(soup.a)                  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
print(soup.find(id="link3"))   # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
print(soup.find_all('a'))      # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
                               #  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
                               #  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<title>The Dormouse's story</title>
title
The Dormouse's story
head
<p class="title"><b>The Dormouse's story</b></p>
[u'title']
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
http://example.com/elsie
http://example.com/lacie
http://example.com/tillie
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
5 / 20
Example #3
# Navigating down the tree: .contents, .string, .descendants, .children.
# Python 2 `print` statements converted to Python 3 calls for consistency
# with the rest of this file.
head_tag = soup.head
print(head_tag)           # <head><title>The Dormouse's story</title></head>
print(head_tag.contents)  # [<title>The Dormouse's story</title>]
print(head_tag.string)    # "The Dormouse's story"
for child in head_tag.descendants:  # <title>The Dormouse's story</title>
    print(child)                    # The Dormouse's story
# -------
title_tag = head_tag.contents[0]
print(title_tag)           # <title>The Dormouse's story</title>
print(title_tag.contents)  # ["The Dormouse's story"]
print(title_tag.string)    # "The Dormouse's story"
text = title_tag.contents[0]
# print(text.contents)  # AttributeError: 'NavigableString' object has no attribute 'contents'
for child in title_tag.children:
    print(child)  # The Dormouse's story
<head><title>The Dormouse's story</title></head>
[<title>The Dormouse's story</title>]
The Dormouse's story
<title>The Dormouse's story</title>
The Dormouse's story
# -------
<title>The Dormouse's story</title>
[u"The Dormouse's story"]
The Dormouse's story
The Dormouse's story
6 / 20
Example #4
# The BeautifulSoup object itself sits at the top of the tree; .strings /
# .stripped_strings iterate all text nodes. Python 2 `print` statements
# converted to Python 3 calls for consistency with the rest of this file.
# print(soup.contents)
print(len(soup.contents))        # 1
print(soup.contents[0].name)     # 'html'
print(len(list(soup.children)))  # 1
print(len(list(soup.descendants)))  # 25
# --------
print(soup.html.string)  # None — <html> has more than one child text node
#for string in soup.strings:
#    print(repr(string))
for string in soup.stripped_strings:  # skips whitespace-only strings
    print(repr(string))
1
html
1
25
None
# --------
u"The Dormouse's story"
u"The Dormouse's story"
u'Once upon a time there were three little sisters; and their names were'
u'Elsie'
u','
u'Lacie'
u'and'
u'Tillie'
u';\nand they lived at the bottom of a well.'
u'...'
7 / 20
Example #6
# Multi-valued attributes: HTML's 'class' parses to a list; 'id' does not.
# Truncated calls restored (closing parens; 'xml' argument grounded by the
# "If you parse a document as XML" note) and prints converted to Python 3.
class_soup = BeautifulSoup('<p class="body strikeout"></p>')
print(class_soup.p['class'])  # ["body", "strikeout"]
class_soup = BeautifulSoup('<p class="body"></p>')
print(class_soup.p['class'])  # ["body"]
id_soup = BeautifulSoup('<p id="my id"></p>')
print(id_soup.p['id'])  # 'my id'
# ----------
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
print(rel_soup.a['rel'])  # ['index']
rel_soup.a['rel'] = ['index', 'contents']
# Turning a tag back into a string consolidates multiple attribute values.
print(rel_soup.p)  # <p>Back to the <a rel="index contents">homepage</a></p>
# ----------
# Parsing as XML: no multi-valued attributes, so 'class' stays one string.
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
print(xml_soup.p['class'])  # 'body strikeout'
['body', 'strikeout']
['body']
my id
# ----------
['index']
<p>Back to the <a rel="index contents">homepage</a></p>
# ----------
body strikeout
When you turn a tag back into a string, multiple attribute
values are consolidated.
If you parse a document as XML, there are no multi-
valued attributes.
The rel attribute specifies the relationship between the
current document and the linked document (Only used if
the href attribute is present).
9 / 20
Easy Web Scraping
@miguelgrinberg
10 / 20
PyCon 2014 - Page Index
11 / 20
PyCon 2014 - Page Details
12 / 20
PyCon 2014 - YouTube
13 / 20
Test #1
# Test #1: collect /video/... links from the PyCon US 2014 index page.
# Truncated statements restored (closing parens); the sample-output line
# is kept as a comment.
import requests
import bs4
response = requests.get('http://pyvideo.org/category/50/pycon-us-2014')
soup = bs4.BeautifulSoup(response.text)
# atags = soup.select('div#video-summary-content a[href^=/video]')
links = [a.attrs.get('href') for a in soup.select('div#video-summary-content a[href^=/video]')]
print(links[1:5])
# ['/video/2676/2d3d-graphics-with-python-on-mobile-platforms', ...]
# The plain selector matches each link twice (thumbnail + title); adding
# 'strong' restricts the match to the title link only.
links = [a.attrs.get('href') for a in soup.select('div#video-summary-content strong a[href^=/video]')]
pycon-scraper.py #1
import bs4
import re
import requests
import argparse
from multiprocessing.pool import ThreadPool as Pool
root_url = 'http://pyvideo.org'
index_url = root_url + '/category/50/pycon-us-2014'

def get_video_page_urls():
    """Scrape the PyCon 2014 index page and return the first ten
    /video/... page paths (truncated selector restored from Test #1)."""
    response = requests.get(index_url)
    soup = bs4.BeautifulSoup(response.text)
    allvids = [a.attrs.get('href') for a in soup.select('div#video-summary-content strong a[href^=/video]')]
    return allvids[0:10]
14 / 20
Test #3
# Test #3: scrape view/like/dislike counts from one YouTube watch page.
# The slide truncated these statements; reconstructed from the selectors
# visible in the original and the get_video_data function below.
import requests
import bs4
import re
response = requests.get('https://www.youtube.com/watch?v=px_vg9Far1Y')
soup = bs4.BeautifulSoup(response.text)
video_data = {}  # NOTE(review): not shown on the slide; needed before the assignments below
video_data['views'] = int(re.sub('[^0-9]', '', soup.select('.watch-view-count')[0].get_text()))
video_data['likes'] = int(re.sub('[^0-9]', '', soup.select('.like-button-renderer-like-button span.yt-uix-button-content')[0].get_text()))
# NOTE(review): dislike selector truncated in the slide — inferred by symmetry with the like selector; verify.
video_data['dislikes'] = int(re.sub('[^0-9]', '', soup.select('.like-button-renderer-dislike-button span.yt-uix-button-content')[0].get_text()))
print(video_data)
{'speakers': [u'Miguel Grinberg'], 'views': 11908, 'title':
pycon-scraper.py #3
def get_video_data(video_page_url):
    """Scrape one video's YouTube page and return its stats dict.

    The slide elides the pyvideo.org scraping that builds video_data
    (title, speakers, youtube_url) before this point. Truncated lines
    reconstructed from the Test #3 snippet; verify the selectors.
    """
    # ...
    # initialize counters — left at 0 if the YouTube scrape fails
    video_data['views'] = 0
    video_data['likes'] = 0
    video_data['dislikes'] = 0
    try:
        # NOTE(review): headers dict truncated on the slide — a browser
        # User-Agent is the usual need here; confirm against the full script.
        response = requests.get(video_data['youtube_url'], headers={'user-agent': 'Mozilla/5.0'})
        soup = bs4.BeautifulSoup(response.text)
        video_data['views'] = int(re.sub('[^0-9]', '', soup.select('.watch-view-count')[0].get_text()))
        video_data['likes'] = int(re.sub('[^0-9]', '', soup.select('.like-button-renderer-like-button span.yt-uix-button-content')[0].get_text()))
        video_data['dislikes'] = int(re.sub('[^0-9]', '', soup.select('.like-button-renderer-dislike-button span.yt-uix-button-content')[0].get_text()))
    except Exception:  # narrowed from bare except: still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
        # some or all of the counters could not be scraped
        pass
    return video_data
16 / 20
pycon-scraper.py #4
Notes
def show_video_stats(options):
    """First version: fetch stats for all videos with 8 parallel workers."""
    pool = Pool(8)
    video_page_urls = get_video_page_urls()
    results = pool.map(get_video_data, video_page_urls)
    # -----
    # Sort key used by the full version below ('def' was lost in extraction).
    def func(video): return video[options.sort]
def parse_args():
    """Parse command-line options for the scraper.

    The add_argument calls were truncated in the slide; reconstructed from
    the example invocation (--sort views --max 25 --workers 8) and the
    sample output. Verify help texts against the full script.
    """
    parser = argparse.ArgumentParser(description='Show PyCon 2014 video statistics.')
    parser.add_argument('--sort', metavar='FIELD', choices=['views', 'likes', 'dislikes'],
                        default='views', help='sort by the specified field')
    parser.add_argument('--max', metavar='MAX', type=int, help='show the top MAX entries only')
    parser.add_argument('--csv', action='store_true', default=False, help='output data in CSV format')
    parser.add_argument('--workers', type=int, default=8, help='number of worker threads (default 8)')
    return parser.parse_args()
# ex: python pycon-scraper.py --sort views --max 25 --workers 8
# ex: python pycon-scraper.py --sort views --max 25 --workers 8
def show_video_stats(options):
    """Full version: scrape all videos in parallel, sort by the requested
    field, and print the top entries as a table or CSV."""
    pool = Pool(options.workers)
    video_page_urls = get_video_page_urls()
    # Truncated sort key restored: descending order by the chosen field.
    results = sorted(pool.map(get_video_data, video_page_urls),
                     key=lambda video: video[options.sort], reverse=True)
    print(len(results))
    # Renamed from 'max' to avoid shadowing the builtin.
    limit = options.max
    if limit is None or limit > len(results):
        limit = len(results)
    if options.csv:
        print(u'"title","speakers", "views","likes","dislikes"')
    else:
        print(u'Views +1 -1 Title (Speakers)')
    for i in range(limit):
        if options.csv:
            print(u'"{0}","{1}",{2},{3},{4}'.format(
                results[i]['title'], ', '.join(results[i]['speakers']),
                results[i]['views'], results[i]['likes'], results[i]['dislikes']))
        else:
            print(u'{0:5d} {1:3d} {2:3d} {3} ({4})'.format(
                results[i]['views'], results[i]['likes'], results[i]['dislikes'],
                results[i]['title'], ', '.join(results[i]['speakers'])))

if __name__ == '__main__':
    show_video_stats(parse_args())
17 / 20
$> python pycon-scraper.py --sort views
Views +1 -1 Title (Speakers)
8608 80 10 Analyzing Rap Lyrics with Python (Julie Lavoie)
7302 28 1 Building the App (Mike Bayer)
4603 25 3 2D/3D graphics with Python on mobile platforms (Niko Skrypnik)
4056 30 0 Designing Django's Migrations (Andrew Godwin)
3923 41 0 Cheap Helicopters In My Living Room (Ned Jackson Lovely)
3407 36 2 A Scenic Drive through the Django Request-Response Cycle (Dan Langer)
3343 33 0 Advanced techniques for Web functional testing (Julien Phalip)
1598 28 0 Data intensive biology in the cloud: instrumenting ALL the things(C. Titus Brown)
941 8 0 Deliver Your Software In An Envelope (Augie Fackler, Nathaniel Manista)
751 1 0 Closing address - PyCon 2014 (2014/04/13) ()
18 / 20
References
1. Beautiful Soup Documentation
2. Easy Web Scraping with Python
3. Generate statistics about PyCon 2014 videos
19 / 20
END
Eueung Mulyana
http://eueung.github.io/python/bs4
Python CodeLabs | Attribution-ShareAlike CC BY-SA
20 / 20