Web Scraping is BS

Web Scraping is BS
John Downs - Software Engineer at Yodle

My Scraper
import requests
from bs4 import BeautifulSoup
!
def get_front_page():
    """Download the Hacker News front page and parse it.

    Returns:
        A BeautifulSoup tree built from the text of the HTTP response.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an error status.
    """
    target = "https://news.ycombinator.com"
    # A timeout keeps the script from hanging forever on a stalled connection.
    frontpage = requests.get(target, timeout=10)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    frontpage.raise_for_status()
    # Naming the parser keeps the tree identical across machines; without it
    # bs4 picks whichever parser happens to be installed and warns about it.
    news_soup = BeautifulSoup(frontpage.text, "html.parser")
    return news_soup

def find_interesting_links(soup):
    """Collect the posts on a parsed page that are worth reading later.

    A post is kept when its title mentions "python" (case-insensitive) or
    when it has more than 50 points and more than 10 comments.

    Args:
        soup: parsed page to search (a BeautifulSoup tree).

    Returns:
        A list of dicts with the keys 'link', 'title', 'score' and
        'comments', one per selected post, in page order.
    """
    # Every <td align="right" class="title"> marks the start of one post;
    # the two cells that follow it carry the post's links.
    search_attrs = {'align': 'right', 'class': 'title'}
    items = soup.find_all('td', search_attrs)
    links = []
    for i in items:
        siblings = i.find_next_siblings(limit=2)
        # Assumes the first sibling's anchor id is a 3-character prefix
        # followed by the post id (e.g. 'up_<id>') -- TODO confirm markup.
        post_id = siblings[0].a.attrs['id'][3:]
        # The second sibling's anchor is the story link: href is the target
        # URL and its text is the headline.
        story_anchor = siblings[1].a
        link = story_anchor.attrs['href']
        title = story_anchor.text
        score = get_score(soup, post_id)
        comments = get_comments(soup, post_id)

        if 'python' in title.lower() or (score > 50 and comments > 10):
            links.append({
                'link': link,
                'title': title,
                'score': score,
                'comments': comments,
            })
    return links

def get_score(soup, post_id):
    """Return the point count shown for a post.

    Args:
        soup: parsed page to search.
        post_id: post identifier as a string; the score is read from the
            <span> whose id is 'score_' + post_id.

    Returns:
        The leading integer of that span's text, or 0 when the page has no
        such span.
    """
    span_tag = soup.find('span', id='score_' + post_id)
    if span_tag is None:
        # No score element for this post: treat it as zero points rather
        # than crashing the whole scrape.
        return 0
    return int(span_tag.text.split()[0])
!
!
!
def get_comments(soup, post_id):
    """Return the number of comments shown for a post.

    Args:
        soup: parsed page to search.
        post_id: post identifier as a string; the count is read from the
            <a> whose href is 'item?id=' + post_id.

    Returns:
        The leading integer of that link's text, or 0 when the link is
        missing or its text does not start with a number.
    """
    a_tag = soup.find('a', href='item?id=' + post_id)
    if a_tag is None:
        return 0
    try:
        return int(a_tag.text.split()[0])
    except (IndexError, ValueError):
        # Empty or non-numeric link text (presumably the label shown when a
        # post has no comments yet -- verify against the page) counts as 0.
        return 0
!

def add_to_pocket(consumer_key, access_token, url):
    """Send a URL to Pocket's "add" endpoint.

    Args:
        consumer_key: value sent as the 'consumer_key' form field.
        access_token: value sent as the 'access_token' form field.
        url: the link to save, sent as the 'url' form field.

    Returns:
        The body of the HTTP response as text. The status code is not
        checked, so an error response is returned as text too.

    Raises:
        requests.RequestException: if the request cannot be completed or
            times out.
    """
    target = 'https://getpocket.com/v3/add'
    request_params = {
        'url': url,
        'consumer_key': consumer_key,
        'access_token': access_token,
    }
    # A timeout keeps one stalled request from blocking the remaining links.
    result = requests.post(target, data=request_params, timeout=10)
    return result.text
!
if __name__ == '__main__':
    # Imported here so the entry point brings in everything it needs itself.
    import os
    from functools import partial

    # Pocket credentials are read from the environment so they never sit in
    # the source; a missing variable raises KeyError before any request.
    consumer_key = os.environ['POCKET_CONSUMER_KEY']
    access_token = os.environ['POCKET_ACCESS_TOKEN']

    soup = get_front_page()
    # Pre-bind the credentials so each call only needs the URL.
    pocket = partial(add_to_pocket, consumer_key, access_token)
    results = find_interesting_links(soup)
    print(results)
    for r in results:
        print(pocket(r['link']))

Searching with BS
• find() / find_all()
• find_parent() / find_parents()
• find_next_sibling() / find_next_siblings()
• find_previous_sibling() / find_previous_siblings()
• find_next() / find_all_next()
• find_previous() / find_all_previous()

Filters
# Sample document searched by the examples that follow.
html_doc = """
<html>
<head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters;
and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
!
# Name the parser explicitly: without it bs4 picks whichever parser is
# installed, which can change the resulting tree, and emits a warning.
soup = BeautifulSoup(html_doc, 'html.parser')
!

Filters - Strings
>>> soup.find_all('b')
!
[<b>The Dormouse's story</b>]

Filters - Regex
# A compiled pattern is matched against tag names: this selects every tag
# whose name starts with "b".
regex = re.compile(r"^b")
for tag in soup.find_all(regex):
    print(tag.name)
!
!
body
b

Filters - Lists
soup.find_all(["a", "b"])
!
[<b>The Dormouse's story</b>,
<a class="sister" href="http://example.com/elsie"
id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie"
id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie"
id="link3">Tillie</a>]

Filters - Functions
def has_class_but_no_id(tag):
    """Return True for a tag that has a 'class' attribute but no 'id'."""
    return tag.has_attr('class') and not tag.has_attr('id')

Filters - Functions
soup.find_all(has_class_but_no_id)
!
[<p class="title"><b>The Dormouse's story</b></p>,
<p class="story">Once upon a time there were...</p>,
<p class="story">...</p>]

The API
find_all(name, attrs, recursive,  
text, limit, **kwargs)

The API
name: A string that matches tags

The API
attrs: a dictionary of html attributes to match
soup.find_all("a", attrs={"class": "sister"})
[<a class="sister" href="http://example.com/elsie"
id="link1">Elsie</a>, 
<a class="sister" href="http://example.com/lacie"
id="link2">Lacie</a>, 
<a class="sister" href="http://example.com/tillie"
id="link3">Tillie</a>]

The API
recursive: a boolean; True (the default) searches all descendants, False searches only direct children

The API
text: search for text instead of tags 
 
soup.find_all(text=re.compile("Dormouse"))
[u"The Dormouse's story", u"The Dormouse's story"]

The API
limit: an int to control the number of items returned

The API
keyword: A keyword argument will search for a tag
with that attribute
!
>>> soup.find_all(id='link2')
[<a class="sister" href="http://example.com/lacie"
id="link2">Lacie</a>]

Navigating with BS
The easiest way to navigate elements down the tree
is to use the dot notation.
>>>soup.head 
<head><title>The Dormouse's story</
title></head>
>>>soup.title 
<title>The Dormouse's story</title>

Navigating with BS
You can look at the children of an element
with .contents
>>>head_tag = soup.head 
>>>head_tag 
<head><title>The Dormouse's story</
title></head>
>>>head_tag.contents 
[<title>The Dormouse's story</title>]

Fifty Years Ago
import datetime

import requests
from bs4 import BeautifulSoup

# Build the URL of the Wikipedia article named "<Month>_<year - 50>",
# e.g. ".../wiki/March_1964" when run in March 2014.
today = datetime.date.today()
month = today.strftime('%B')
fyo = str(today.year - 50)
url = 'http://en.wikipedia.org/wiki/' + month + '_' + fyo
# NOTE(review): `target` is never read below; presumably it was meant to
# select the heading for today's date -- confirm against the original script.
target = month + '_' + str(today.day) + '.'

data = requests.get(url).text
# Name the parser so the tree does not depend on which parsers are installed.
soup = BeautifulSoup(data, 'html.parser')

# Collect every <span> carrying an id inside the <div id="toc"> element.
contents = soup.find('div', id='toc')
a = contents.findAll(
    lambda tag: tag.name == 'span' and
    tag.has_attr('id'))

# NOTE(review): findAll() returns a list-like ResultSet, which has no `.ul`
# attribute, so this loop fails as written. The intended element (probably
# the list under the heading matching `target`) cannot be determined from
# this snippet -- TODO confirm and fix the lookup.
for event in a.ul.findAll('li'):
    print(event.text)

RTFM
• http://docs.python-requests.org/en/latest
• http://www.crummy.com/software/BeautifulSoup/
• http://lxml.de/
• http://jakeaustwick.me/python-web-scraping-resource/
• https://github.com/jdowns/scraper
• john.downs@acm.org

Web Scraping is BS

Recomendados

Recomendados

Más contenido relacionado

La actualidad más candente

La actualidad más candente (20)

Destacado

Destacado (16)

Similar a Web Scraping is BS

Similar a Web Scraping is BS (20)

Último

Último (20)

Web Scraping is BS