# ai_economist/datasets/covid19_datasets/us_unemployment.py
# Copyright (c) 2021, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root
# or https://opensource.org/licenses/BSD-3-Clause
import bz2
import os
import pickle
import queue
import threading
import urllib.request as urllib2
import pandas as pd
from bs4 import BeautifulSoup
class DatasetCovidUnemploymentUS:
    """
    Class to load COVID-19 unemployment data for the US states.
    Source: https://www.bls.gov/lau/

    Monthly state-level unemployment rates are scraped from the Bureau of
    Labor Statistics website and cached on disk as a bz2-compressed pickle
    so later runs can skip the network.

    After construction, ``self.data`` maps a state name to a nested dict:
    {year (int) -> {month (1-12) -> unemployment rate (float)}}.
    """

    # Month abbreviations as they appear in the BLS timeseries tables.
    _MTH2IDX = {
        "Jan": 1,
        "Feb": 2,
        "Mar": 3,
        "Apr": 4,
        "May": 5,
        "Jun": 6,
        "Jul": 7,
        "Aug": 8,
        "Sep": 9,
        "Oct": 10,
        "Nov": 11,
        "Dec": 12,
    }

    def __init__(self, data_dir="", download_latest_data=True):
        """
        Load (or scrape and cache) the monthly U.S. unemployment data.

        Args:
            data_dir (str): Directory used to cache the scraped data.
                Created if it does not already exist.
            download_latest_data (bool): If True (default), always re-scrape
                from BLS and overwrite the cache; otherwise reuse the cached
                pickle when present.
        """
        if not os.path.exists(data_dir):
            print(
                "Creating a dynamic data directory to store COVID-19 "
                "unemployment data: {}".format(data_dir)
            )
            os.makedirs(data_dir)

        filename = "monthly_us_unemployment.bz2"
        # Direct file check instead of scanning the whole directory listing.
        filepath = os.path.join(data_dir, filename)

        if download_latest_data or not os.path.isfile(filepath):
            # Construct the U.S. state -> FIPS code mapping from the Census
            # Bureau's geocode spreadsheet.
            state_fips_df = pd.read_excel(
                "https://www2.census.gov/programs-surveys/popest/geographies/2017/"
                "state-geocodes-v2017.xlsx",
                header=5,
            )
            # Remove all statistical areas and cities: only state rows carry
            # a nonzero "State (FIPS)" value.
            state_fips_df = state_fips_df.loc[state_fips_df["State (FIPS)"] != 0]
            self.us_state_to_fips_dict = pd.Series(
                state_fips_df["State (FIPS)"].values, index=state_fips_df.Name
            ).to_dict()

            print(
                "Fetching the U.S. unemployment data from "
                "Bureau of Labor Statistics, and saving it in {}".format(data_dir)
            )
            self.data = self.scrape_bls_data()
            # Context manager ensures the handle is closed even if pickling
            # fails (the previous open/close pattern leaked it on error).
            with bz2.BZ2File(filepath, "wb") as fp:
                pickle.dump(self.data, fp)
        else:
            print(
                "Not fetching the U.S. unemployment data from Bureau of Labor"
                " Statistics. Using whatever was saved earlier in {}!!".format(data_dir)
            )
            with bz2.BZ2File(filepath, "rb") as fp:
                self.data = pickle.load(fp)

    def get_monthly_bls_unemployment_rates(self, state_fips):
        """
        Scrape monthly unemployment rates for one state from the BLS website.

        Args:
            state_fips (int): The state's FIPS code (used to build the
                LASST timeseries id).

        Returns:
            dict: {year (int) -> {month (1-12) -> rate (float)}}.
        """
        url = "https://data.bls.gov/timeseries/LASST{:02d}0000000000003".format(
            state_fips
        )
        with urllib2.urlopen(url) as response:
            html_doc = response.read()
        soup = BeautifulSoup(html_doc, "html.parser")
        # The second table on the page holds the time series.
        table = soup.find_all("table")[1]
        table_rows = table.find_all("tr")

        unemployment_dict = {}
        # Skip the header row and the trailing row.
        for tr in table_rows[1:-1]:
            # The last data cell holds the rate; keep only digits and '.' so
            # stray non-numeric characters don't break float().
            td = tr.find_all("td")[-1]
            unemp = float("".join(c for c in td.text if c.isdigit() or c == "."))
            # Row headers carry the year and the month abbreviation.
            th = tr.find_all("th")
            year = int(th[0].text)
            month = self._MTH2IDX[th[1].text]
            unemployment_dict.setdefault(year, {})[month] = unemp
        return unemployment_dict

    def scrape_bls_data(self):
        """
        Scrape BLS unemployment rates for every state, one thread per state.

        The work is network-bound, so plain threads overlap the waits.

        Returns:
            dict: state name -> {year -> {month -> rate}}.
        """

        def do_scrape(us_state, fips, queue_obj):
            # Runs in a worker thread; results are funneled through the
            # thread-safe queue.
            out = self.get_monthly_bls_unemployment_rates(fips)
            queue_obj.put([us_state, out])

        print("Getting BLS Data. This might take a minute...")
        result = queue.Queue()
        threads = [
            threading.Thread(target=do_scrape, args=(us_state, fips, result))
            for us_state, fips in self.us_state_to_fips_dict.items()
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # NOTE(review): a worker that raises prints a traceback and its state
        # is silently absent from the result — confirm this is acceptable.
        monthly_unemployment = {}
        while not result.empty():
            us_state, data = result.get()
            monthly_unemployment[us_state] = data
        return monthly_unemployment