import pandas as pd
import numpy as np
from requests import get
import re
from bs4 import BeautifulSoup
import os
Goals: Write a function to scrape the urls from the main Codeup blog page, and write a function that returns a dictionary of the title and text for each blog post.
Here I use the `.find()` method on my soup with the `<h1>` tag. As always, there is no one way to accomplish our task, so I'm demonstrating one way to scrape the headline, not THE way to scrape the headline.
url = 'https://codeup.com/codeups-data-science-career-accelerator-is-here/'
headers = {'User-Agent': 'Codeup Data Science'}
response = get(url, headers=headers)
response.ok
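A quick check on the response status is optional but cheap; the guard below is my own addition and assumes we'd rather fail fast than parse an error page.
# Optional guard (my addition): raise an HTTPError for a 4xx/5xx response instead of parsing it.
if not response.ok:
    response.raise_for_status()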
# Here's our long string of HTML; we'll use response.text to make our soup object.
print(type(response.text))
# Create our Soup object by passing our HTML string and choice of parser.
soup = BeautifulSoup(response.text, 'html.parser')
# Now we have our BeautifulSoup object and can use its built-in methods and attributes.
print(type(soup))
# The h1 element holds my title.
title = soup.find('h1').text
title
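As noted above, `.find()` is just one option; these equivalent one-liners are a sketch of alternatives and aren't used below.
# Equivalent alternatives for grabbing the same headline (not used below):
title_via_css = soup.select_one('h1').text             # CSS selector instead of .find()
title_stripped = soup.find('h1').get_text(strip=True)  # strips surrounding whitespace
title_via_css, title_stripped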
content = soup.find('div', class_="jupiterx-post-content").text
print(content)
print(type(content))
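The raw text keeps the page's whitespace and newlines; since `re` is already imported, a quick normalization like this is an optional step of my own, not required for the scrape.
# Optional cleanup (my addition): collapse runs of whitespace into single spaces.
clean_content = re.sub(r'\s+', ' ', content).strip()
clean_content[:200]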
# Create a helper function that requests and parses HTML returning a soup object.
def make_soup(url):
    '''
    This helper function takes in a url and requests and parses HTML
    returning a soup object.
    '''
    headers = {'User-Agent': 'Codeup Data Science'}
    response = get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup
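If you want the helper to fail loudly on a bad request, a slightly hardened variant might look like this; it's a sketch of an alternative, not the version used in the functions below.
def make_soup_checked(url):
    '''
    Hypothetical variant of make_soup that raises on a failed request
    instead of quietly parsing an error page.
    '''
    headers = {'User-Agent': 'Codeup Data Science'}
    response = get(url, headers=headers)
    # Raise an HTTPError for a 4xx/5xx response before parsing.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')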
def get_blog_articles(urls, cached=False):
    '''
    This function takes in a list of Codeup Blog urls and a parameter
    with default cached == False which scrapes the title and text for each url,
    creates a list of dictionaries with the title and text for each blog,
    converts the list to a df, and returns the df.
    If cached == True, the function returns a df from a json file.
    '''
    if cached == True:
        df = pd.read_json('big_blogs.json')
    # cached == False completes a fresh scrape for df
    else:
        # Create an empty list to hold dictionaries
        articles = []
        # Loop through each url in our list of urls
        for url in urls:
            # Make request and soup object using helper
            soup = make_soup(url)
            # Save the title of each blog in variable title
            title = soup.find('h1').text
            # Save the text of each blog in variable content
            content = soup.find('div', class_="jupiterx-post-content").text
            # Create a dictionary holding the title and content for each blog
            article = {'title': title, 'content': content}
            # Add each dictionary to the articles list of dictionaries
            articles.append(article)
        # Convert our list of dictionaries to a df
        df = pd.DataFrame(articles)
        # Write df to a json file for faster access
        df.to_json('big_blogs.json')
    return df
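Since `os` is imported above but never used, one optional refinement (my own assumption, not part of the original function) is to trust the cache only when the json file actually exists.
# Hypothetical wrapper (my addition): use the cache only if big_blogs.json exists on disk.
def get_blog_articles_auto(urls):
    return get_blog_articles(urls, cached=os.path.isfile('big_blogs.json'))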
# Here cached == False, so the function will do a fresh scrape of the urls and write data to a json file.
urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/',
'https://codeup.com/data-science-myths/',
'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']
blogs = get_blog_articles(urls=urls, cached=False)
blogs
# I'm going to hit Codeup's main blog page to scrape the urls and use my new function.
url = 'https://codeup.com/resources/#blog'
soup = make_soup(url)
# I'm filtering my soup to return a list of all anchor elements from my HTML. (view first 2)
urls_list = soup.find_all('a', class_='jet-listing-dynamic-link__link')
urls_list[:2]
# Extract the href attribute value from each anchor element in my list; we scraped 40 urls.
# I'm using a set comprehension to return only unique urls because there are two links for each article.
urls = {link.get('href') for link in urls_list}
# I'm converting my set to a list of urls.
urls = list(urls)
print(f'There are {len(urls)} unique links in our urls list.')
print()
urls
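The `re` import also comes in handy here: if the anchor list ever picks up links that aren't blog posts, a filter like the one below would narrow it down. The pattern is my assumption about Codeup's url structure, so treat it as an optional sketch.
# Optional filter (my assumption about the url pattern): keep only codeup.com links.
urls = [u for u in urls if re.match(r'^https://codeup\.com/', u)]
len(urls)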
def get_all_urls():
    '''
    This function scrapes all of the Codeup blog urls from
    the main Codeup blog page and returns a list of urls.
    '''
    # The base url for the main Codeup blog page
    url = 'https://codeup.com/resources/#blog'
    # Make request and soup object using helper
    soup = make_soup(url)
    # Create a list of the anchor elements that hold the urls.
    urls_list = soup.find_all('a', class_='jet-listing-dynamic-link__link')
    # I'm using a set comprehension to return only unique urls because the list contains duplicates.
    urls = {link.get('href') for link in urls_list}
    # I'm converting my set to a list of urls.
    urls = list(urls)
    return urls
# Now I can use my get_blog_articles function together with my new get_all_urls function.
# cached == False does a fresh scrape.
big_blogs = get_blog_articles(urls=get_all_urls(), cached=False)
big_blogs.head(10)
big_blogs.info()
# cached == True reads in a df from `big_blogs.json`.
big_blogs = get_blog_articles(urls=get_all_urls(), cached=True)
big_blogs.head()
Goal: Write a function that scrapes the news articles from inshorts.com for the following topics: business, sports, technology, and entertainment.
# Make the soup object using my function.
url = 'https://inshorts.com/en/read/entertainment'
soup = make_soup(url)
# Scrape a ResultSet of all the news cards on the page and inspect the elements on the first card.
cards = soup.find_all('div', class_='news-card')
print(f'There are {len(cards)} news cards on this page.')
print()
cards[0]
# Create a list of titles using the span element and itemprop attribute with text method.
titles = [card.find('span', itemprop='headline').text for card in cards]
titles[:5]
# Create a list of authors using the span element and class attribute with text method.
authors = [card.find('span', class_='author').text for card in cards]
authors[:5]
# Create a list of content strings using the div element and itemprop attribute with text method.
content = [card.find('div', itemprop='articleBody').text for card in cards]
content[:5]
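Since the three parallel lists above line up card for card, the same list of dictionaries could be built in one comprehension with zip; this is an equivalent sketch, while the cell below does the same thing card by card.
# Equivalent construction (sketch) from the three parallel lists above.
articles_alt = [{'title': t, 'author': a, 'content': c}
                for t, a, c in zip(titles, authors, content)]
articles_alt[0]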
# Create an empty list, articles, to hold the dictionaries for each article.
articles = []
# Loop through each news card on the page and get what we want
for card in cards:
    title = card.find('span', itemprop='headline').text
    author = card.find('span', class_='author').text
    content = card.find('div', itemprop='articleBody').text
    # Create a dictionary, article, for each news card
    article = {'title': title, 'author': author, 'content': content}
    # Add the dictionary, article, to our list of dictionaries, articles.
    articles.append(article)
# Here we see our list contains a dictionary for each of the news cards on the page.
print(len(articles))
articles[0]
def get_news_articles(cached=False):
    '''
    This function with default cached == False does a fresh scrape of inshorts pages with topics
    business, sports, technology, and entertainment and writes the returned df to a json file.
    cached == True returns a df read in from a json file.
    '''
    # Option to read in a json file instead of scraping for the df
    if cached == True:
        df = pd.read_json('articles.json')
    # cached == False completes a fresh scrape for df
    else:
        # Set base_url that will be used in get request
        base_url = 'https://inshorts.com/en/read/'
        # List of topics to scrape
        topics = ['business', 'sports', 'technology', 'entertainment']
        # Create an empty list, articles, to hold our dictionaries
        articles = []
        for topic in topics:
            # Create url with topic endpoint
            topic_url = base_url + topic
            # Make request and soup object using helper
            soup = make_soup(topic_url)
            # Scrape a ResultSet of all the news cards on the page
            cards = soup.find_all('div', class_='news-card')
            # Loop through each news card on the page and get what we want
            for card in cards:
                title = card.find('span', itemprop='headline').text
                author = card.find('span', class_='author').text
                content = card.find('div', itemprop='articleBody').text
                # Create a dictionary, article, for each news card
                article = {'topic': topic,
                           'title': title,
                           'author': author,
                           'content': content}
                # Add the dictionary, article, to our list of dictionaries, articles.
                articles.append(article)
        # Create a DataFrame from the list of dictionaries
        df = pd.DataFrame(articles)
        # Write df to a json file for future use
        df.to_json('articles.json')
    return df
# Test our function with cached == False to do a fresh scrape and create the `articles.json` file.
df = get_news_articles(cached=False)
df.head()
df.topic.value_counts()
df.info()
# Test our function with cached == True to read in the df from `articles.json`.
df = get_news_articles(cached=True)
df.head()
df.info()