import pandas as pd
import numpy as np
import os
import requests
import warnings
warnings.filterwarnings("ignore")
I'm going to investigate the documentation provided by the API and explore a couple of responses before I dig into the exercises.
# I can make a request to the url below and use the `.json()` method on my results to return a dictionary object.
base_url = 'https://python.zach.lol'
type(requests.get(base_url).json())
# I have two choices of paths I can add to my base url, '/api/v1' and '/documentation'.
requests.get(base_url).json()
# I'll create a doc_url to request help with using this api below.
doc_url = base_url + '/documentation'
# I have two keys in the dictionary returned from my request.
requests.get(doc_url).json().keys()
# I can print the value for the status key.
print(requests.get(doc_url).json()['status'])
# I can print the value for the payload key.
print(requests.get(doc_url).json()['payload'])
This tells me that I can use 3 different endpoints to access data by adding stores, items, or sales to my base_url + /api/v1/, like below:
'https://python.zach.lol/api/v1/items'
'https://python.zach.lol/api/v1/stores'
'https://python.zach.lol/api/v1/sales'
There is also a page parameter that I can add to each of these endpoints to navigate through multiple pages of results.
'?page=n'
For example:
'https://python.zach.lol/api/v1/items?page=1'
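As a quick sketch of what that looks like (just checking the page parameter before the exercises), I can request page 1 of items by its full URL and confirm the page number reported in the payload:
# Request page 1 of items and confirm which page the payload says it is.
requests.get('https://python.zach.lol/api/v1/items?page=1').json()['payload']['page']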
# I will create my api url.
api_url = base_url + '/api/v1/'
Using the code from the lesson as a guide, create a dataframe named items that has all of the data for items.
# This submits the request for the first page of results and stores the results in response.
# My request was successful.
response = requests.get(api_url + 'items')
response.ok
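For context, `.ok` is just shorthand for a status code below 400; the code itself is also available if I want to see it:
# 200 means the request succeeded.
response.status_code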
# Use the `.json()` method on my response to get a dictionary object; I'll store it in the `data` variable.
data = response.json()
print(type(data))
data
# List the keys in my dictionary object; I see payload and status.
data.keys()
I can see above that 'payload' is also a dictionary object; I also see that the first key, items, has a value that is a list of dictionaries. I can check out all of the key:value pairs in payload to see what is of use to me.
# Look at the keys in the payload dictionary.
data['payload'].keys()
# I see that the `items` list holds 20 dictionaries (items).
len(data['payload']['items'])
# I'll check out just the first 2 dictionaries (items) in the list.
data['payload']['items'][:2]
# Look at the values of the other keys in the 'payload' dictionary.
print(f"The current page of the results from my request is {data['payload']['page']}.")
print(f"The next page of the results from my request is {data['payload']['next_page']}.")
print(f"The total number of pages in the results from my request is {data['payload']['max_page']}.")
print(f"The previous page in the results from my request is {data['payload']['previous_page']}.")
# I create a list variable to hold the list of the 20 items from page one.
items = data['payload']['items']
print(len(items))
type(items)
# 'next_page' returns the path and page param for the second page of results.
data['payload']['next_page']
# Submit a request for the next page and store it in the `response` variable.
response = requests.get(base_url + data['payload']['next_page'])
# Use the `.json()` method to return a dictionary object like I did above for page 1.
data = response.json()
# Add items from the second page to our list using `.extend()`
items.extend(data['payload']['items'])
# The `items` list now contains 40 items (dictionaries).
len(items)
# Our next page is page 3 of 3 for items.
data['payload']['next_page']
# Grab the next page in the same way and add items to my `items` list.
# I see there are only 10 items on this last page.
response = requests.get(base_url + data['payload']['next_page'])
data = response.json()
len(data['payload']['items'])
# Add the last 10 items to my `items` list, which now contains a total of 50 items.
items.extend(data['payload']['items'])
len(items)
There is no next page, so data['payload']['next_page'] returns None. This could come in handy when we write our function later to automate the above process.
data['payload']['next_page'] == None
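That None is the stopping condition I'll rely on below; the loop in my function will look roughly like this sketch (running it here does nothing, since we're already on the last page):
# Keep following `next_page` until the API returns None.
while data['payload']['next_page'] is not None:
    data = requests.get(base_url + data['payload']['next_page']).json()
    items.extend(data['payload']['items'])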
# Use our items, our list of dictionaries, to create a DataFrame
items_df = pd.DataFrame(items)
print(f'The items_df has the shape {items_df.shape}.\n')
items_df.head(2)
# I want to see how many pages of stores I have to request.
api_url = base_url + '/api/v1/'
response = requests.get(api_url + 'stores')
data = response.json()
data['payload']['max_page']
# This time I want to grab stores instead of items.
data['payload'].keys()
# Again, I have a list of dictionaries; I can convert this into a pandas DataFrame now.
stores = data['payload']['stores']
stores_df = pd.DataFrame(stores)
print(f"My stores_df has the shape {stores_df.shape}")
stores_df.head()
Extract the data for sales. Your code should continue fetching data from the next page until all of the data is extracted.
api_url = base_url + '/api/v1/'
response = requests.get(api_url + 'sales')
data = response.json()
data['payload']['max_page']
The function below requests the data from the API for whichever path name I pass in ('items', 'stores', or 'sales'), returns it as a dataframe, and saves it to a csv file for future use.
def get_df(name):
    """
    This function takes in the string
    'items', 'stores', or 'sales' and
    returns a df containing all pages and
    creates a .csv file for future use.
    """
    base_url = 'https://python.zach.lol'
    api_url = base_url + '/api/v1/'
    response = requests.get(api_url + name)
    data = response.json()
    # create list from 1st page
    my_list = data['payload'][name]
    # loop through the pages and add to list
    while data['payload']['next_page'] is not None:
        response = requests.get(base_url + data['payload']['next_page'])
        data = response.json()
        my_list.extend(data['payload'][name])
    # Create DataFrame from list
    df = pd.DataFrame(my_list)
    # Write DataFrame to csv file for future use
    df.to_csv(name + '.csv')
    return df
items_df = get_df('items')
print(items_df.shape)
items_df.head()
stores_df = get_df('stores')
print(stores_df.shape)
stores_df.head()
sales_df = get_df('sales')
print(sales_df.shape)
sales_df.head()
# I can see all of my dataframes above, so I know how to join and what to drop.
df = pd.merge(sales_df, stores_df, left_on='store', right_on='store_id').drop(columns={'store'})
df.head(2)
df = pd.merge(df, items_df, left_on='item', right_on='item_id').drop(columns={'item'})
df.head(2)
df.shape
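As a quick sanity check (assuming every sale references a valid store and item, so the inner merges drop nothing), the merged row count should match sales_df:
# True if no sales rows were lost in the merges.
df.shape[0] == sales_df.shape[0]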
There is another way that I can approach pagination of APIs using the params parameter with the .get() method. The documentation for the API informed me that "All endpoints accept a page parameter that can be used to navigate through the results."
Above we used the value of data['payload']['next_page'] to provide the path and query parameter, '/api/v1/items?page=n', that we concatenated to our base_url, https://python.zach.lol, to access each page.
Below, I will instead pass a dictionary to params to 'turn the pages', so to speak. This is just a different way to access the data and may come in handy when you work with different APIs. If it's TMI right now, skip it; the above method works fine for this API.
The general form is requests.get(url, params={key: value}, args), where args stands for the other optional keyword arguments that the .get() method accepts.
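As a quick illustration (the page value here is arbitrary), requests encodes the params dictionary into the query string for me:
# requests builds the final URL, including the ?page=2 query string.
r = requests.get('https://python.zach.lol/api/v1/items', params={'page': 2})
r.url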
# Create endpoints for use below.
items_url = 'https://python.zach.lol/api/v1/items'
stores_url = 'https://python.zach.lol/api/v1/stores'
sales_url = 'https://python.zach.lol/api/v1/sales'
# Create an empty list named `results`.
results = []
# Loop through the pages of my endpoint, stopping early if a response comes back empty.
for i in range(3):
    response = requests.get(items_url, params={"page": i + 1})
    # We have reached the end of the results if the response length is 0.
    if len(response.json()) == 0:
        break
    else:
        # Convert my response to a dictionary and store as variable `data`.
        data = response.json()
        # Add the list of dictionaries to my list
        results.extend(data['payload']['items'])
print(results[:2])
len(results)
def get_df_params(name):
    """
    This function takes in the string
    'items', 'stores', or 'sales' and
    returns a df containing all pages and
    creates a .csv file for future use.
    """
    # Create an empty list named `results`.
    results = []
    # Create api_url variable
    api_url = 'https://python.zach.lol/api/v1/'
    # Request the first page to grab its data and find out how many pages there are.
    response = requests.get(api_url + name, params={"page": 1})
    data = response.json()
    results.extend(data['payload'][name])
    max_page = data['payload']['max_page']
    # Loop through the remaining page parameters, adding each page's list of dictionaries.
    for page in range(2, max_page + 1):
        response = requests.get(api_url + name, params={"page": page})
        data = response.json()
        results.extend(data['payload'][name])
    # Create DataFrame from list
    df = pd.DataFrame(results)
    # Write DataFrame to csv file for future use
    df.to_csv(name + '.csv')
    return df
get_df_params('items').head()
# This helper function returns the same data as my other function.
get_df_params('items').shape
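As a quick sanity check (a sketch, assuming both helpers hit the same endpoints), the two DataFrames should be identical:
# Both helpers build a DataFrame from the same list of item dictionaries.
get_df('items').equals(get_df_params('items'))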
get_store_data() Function
Create a function that checks for a csv file, and if one doesn't exist it creates one.
The function should also create one large df using all three dfs.
Create this function using either of our helper functions above; your choice.
def get_store_data():
    """
    This function checks for csv files
    for items, sales, stores, and big_df;
    if any are missing, it creates them.
    It returns one big_df of merged dfs.
    """
    # check for csv files or create them
    if os.path.isfile('items.csv'):
        items_df = pd.read_csv('items.csv', index_col=0)
    else:
        items_df = get_df('items')
    if os.path.isfile('stores.csv'):
        stores_df = pd.read_csv('stores.csv', index_col=0)
    else:
        stores_df = get_df('stores')
    if os.path.isfile('sales.csv'):
        sales_df = pd.read_csv('sales.csv', index_col=0)
    else:
        sales_df = get_df('sales')
    if os.path.isfile('big_df.csv'):
        df = pd.read_csv('big_df.csv', index_col=0)
        return df
    else:
        # merge all of the DataFrames into one
        df = pd.merge(sales_df, stores_df, left_on='store', right_on='store_id').drop(columns={'store'})
        df = pd.merge(df, items_df, left_on='item', right_on='item_id').drop(columns={'item'})
        # write the merged df with all of the data to a csv for future use
        df.to_csv('big_df.csv')
        return df
df = get_store_data()
df.head(2)
df.shape
df.info()
url = 'https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv'
df = pd.read_csv(url)
df.head()
df.info()
opsd_germany_daily() Function
def opsd_germany_daily():
    """
    This function uses or creates the
    opsd_germany_daily csv and returns a df.
    """
    if os.path.isfile('opsd_germany_daily.csv'):
        df = pd.read_csv('opsd_germany_daily.csv', index_col=0)
    else:
        url = 'https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv'
        df = pd.read_csv(url)
        df.to_csv('opsd_germany_daily.csv')
    return df
gdf = opsd_germany_daily()
gdf.head(2)
gdf.info()
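One caveat about the cached csv (an assumption about how you'll use the data later): the Date column comes back as plain strings, so you may want to convert it to a datetime index after loading.
# Convert Date to datetime and set it as the index (assumes the csv has a 'Date' column).
gdf['Date'] = pd.to_datetime(gdf['Date'])
gdf = gdf.set_index('Date').sort_index()
gdf.info()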