# Copyright (c) 2021, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root
# or https://opensource.org/licenses/BSD-3-Clause

import bz2
import os
import pickle
import queue
import threading
import urllib.request as urllib2

import pandas as pd
from bs4 import BeautifulSoup

# Maps the month abbreviations found in the BLS table's row headers to
# 1-based month numbers. Built once instead of on every scrape call.
_MONTH_TO_INDEX = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12,
}


class DatasetCovidUnemploymentUS:
    """
    Class to load COVID-19 unemployment data for the US states.
    Source: https://www.bls.gov/lau/

    After construction, ``self.data`` holds a nested dictionary of the form
    ``{state_name: {year: {month: unemployment_rate}}}``, either freshly
    scraped from the BLS website or loaded from a bz2-compressed pickle
    previously written to ``data_dir``.
    """

    def __init__(self, data_dir="", download_latest_data=True):
        """
        Load the monthly unemployment data, downloading it if required.

        Args:
            data_dir: Directory used to cache the scraped data. It is created
                if it does not exist. An empty string means the current
                working directory.
            download_latest_data: When True, always re-scrape the data and
                overwrite the cache. When False, the cached file is used if
                it is present; otherwise the data is scraped anyway.
        """
        # os.makedirs("") and os.listdir("") both raise FileNotFoundError, so
        # the default argument used to crash. os.path.join("", filename)
        # already resolves relative to the current directory, so map the
        # empty path to it explicitly.
        data_dir = data_dir or os.curdir

        if not os.path.exists(data_dir):
            print(
                "Creating a dynamic data directory to store COVID-19 "
                "unemployment data: {}".format(data_dir)
            )
            os.makedirs(data_dir)

        filename = "monthly_us_unemployment.bz2"
        cache_path = os.path.join(data_dir, filename)

        if download_latest_data or not os.path.exists(cache_path):
            # Construct the U.S. state to FIPS code mapping
            # NOTE(review): header=5 presumably skips the spreadsheet's
            # preamble rows -- confirm against the current Census file.
            state_fips_df = pd.read_excel(
                "https://www2.census.gov/programs-surveys/popest/geographies/2017/"
                "state-geocodes-v2017.xlsx",
                header=5,
            )
            # remove all statistical areas and cities
            # (i.e. keep only the rows with a non-zero state FIPS code)
            state_fips_df = state_fips_df.loc[state_fips_df["State (FIPS)"] != 0]
            self.us_state_to_fips_dict = pd.Series(
                state_fips_df["State (FIPS)"].values, index=state_fips_df.Name
            ).to_dict()

            print(
                "Fetching the U.S. unemployment data from "
                "Bureau of Labor and Statistics, and saving it in {}".format(data_dir)
            )
            self.data = self.scrape_bls_data()

            # The context manager closes the file even if pickling fails.
            with bz2.BZ2File(cache_path, "wb") as fp:
                pickle.dump(self.data, fp)
        else:
            print(
                "Not fetching the U.S. unemployment data from Bureau of Labor and"
                " Statistics. \nUsing whatever was saved earlier in {}!!".format(
                    data_dir
                )
            )
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # point data_dir at a location whose contents you trust (the
            # file is expected to be one this class wrote earlier).
            with bz2.BZ2File(cache_path, "rb") as fp:
                self.data = pickle.load(fp)

    # Scrape monthly unemployment from the Bureau of Labor Statistics website
    def get_monthly_bls_unemployment_rates(self, state_fips):
        """
        Scrape the monthly unemployment-rate table for a single state.

        Args:
            state_fips: Integer FIPS code of the state; it is zero-padded to
                two digits to build the BLS series URL.

        Returns:
            A dictionary ``{year: {month: rate}}`` where ``year`` is an int,
            ``month`` is an int in 1..12 and ``rate`` is a float.

        Raises:
            urllib.error.URLError: If the page cannot be fetched.
            IndexError, KeyError, ValueError: If the page layout differs from
                what the parser expects.
        """
        url = "https://data.bls.gov/timeseries/LASST{:02d}0000000000003".format(
            state_fips
        )
        with urllib2.urlopen(url) as response:
            html_doc = response.read()

        soup = BeautifulSoup(html_doc, "html.parser")
        # The data lives in the second <table> element of the page.
        table = soup.find_all("table")[1]
        table_rows = table.find_all("tr")

        unemployment_dict = {}

        # The first and last rows are skipped (presumably a header and a
        # footer row -- confirm against the live page).
        for tr in table_rows[1:-1]:
            # The value is in the last cell; strip everything except digits
            # and the decimal point before converting.
            td = tr.find_all("td")[-1]
            unemp = float("".join(c for c in td.text if c.isdigit() or c == "."))
            # Row headers carry the year and the month abbreviation.
            th = tr.find_all("th")
            year = int(th[0].text)
            month = _MONTH_TO_INDEX[th[1].text]
            unemployment_dict.setdefault(year, {})[month] = unemp

        return unemployment_dict

    def scrape_bls_data(self):
        """
        Scrape the monthly unemployment rates of every known state.

        One thread is started per entry of ``self.us_state_to_fips_dict``
        (which ``__init__`` only populates when it downloads fresh data), and
        each thread fetches one state's table.

        Returns:
            A dictionary ``{state_name: {year: {month: rate}}}``. A state
            whose fetch failed is absent from the result; a warning naming
            such states is printed.
        """

        def do_scrape(us_state, fips, queue_obj):
            out = self.get_monthly_bls_unemployment_rates(fips)
            queue_obj.put([us_state, out])

        print("Getting BLS Data. This might take a minute...")
        result = queue.Queue()
        threads = [
            threading.Thread(target=do_scrape, args=(us_state, fips, result))
            for us_state, fips in self.us_state_to_fips_dict.items()
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # All workers have finished, so the queue can be drained safely.
        monthly_unemployment = {}
        while not result.empty():
            us_state, data = result.get()
            monthly_unemployment[us_state] = data

        # An exception inside a worker thread is not propagated to this
        # thread, so a failed state would otherwise just be missing.
        missing = [
            us_state
            for us_state in self.us_state_to_fips_dict
            if us_state not in monthly_unemployment
        ]
        if missing:
            print(
                "Warning: no BLS data was retrieved for: {}".format(
                    ", ".join(str(us_state) for us_state in missing)
                )
            )

        return monthly_unemployment