adding ai_economist for modding
This commit is contained in:
122
ai_economist/datasets/covid19_datasets/us_policies.py
Normal file
122
ai_economist/datasets/covid19_datasets/us_policies.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# Copyright (c) 2021, salesforce.com, inc.
|
||||
# All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
# For full license text, see the LICENSE file in the repo root
|
||||
# or https://opensource.org/licenses/BSD-3-Clause
|
||||
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import requests
|
||||
|
||||
|
||||
class DatasetCovidPoliciesUS:
|
||||
"""
|
||||
Class to load COVID-19 government policies for the US states.
|
||||
Source: https://github.com/OxCGRT/USA-covid-policy
|
||||
|
||||
Other references:
|
||||
- Codebook: https://github.com/OxCGRT/covid-policy-tracker/blob/master/
|
||||
documentation/codebook.md
|
||||
- Index computation methodology: https://github.com/OxCGRT/covid-policy-tracker/
|
||||
blob/master/documentation/index_methodology.md
|
||||
|
||||
Attributes:
|
||||
df: Timeseries dataframe of state-wide policies
|
||||
"""
|
||||
|
||||
def __init__(self, data_dir="", download_latest_data=True):
|
||||
if not os.path.exists(data_dir):
|
||||
print(
|
||||
"Creating a dynamic data directory to store COVID-19 "
|
||||
"policy tracking data: {}".format(data_dir)
|
||||
)
|
||||
os.makedirs(data_dir)
|
||||
|
||||
filename = "daily_us_policies.csv"
|
||||
if download_latest_data or filename not in os.listdir(data_dir):
|
||||
print(
|
||||
"Fetching latest U.S. COVID-19 policies data from OxCGRT, "
|
||||
"and saving it in {}".format(data_dir)
|
||||
)
|
||||
req = requests.get(
|
||||
"https://raw.githubusercontent.com/OxCGRT/USA-covid-policy/master/"
|
||||
"data/OxCGRT_US_latest.csv"
|
||||
)
|
||||
self.df = pd.read_csv(BytesIO(req.content), low_memory=False)
|
||||
self.df["Date"] = self.df["Date"].apply(
|
||||
lambda x: datetime.strptime(str(x), "%Y%m%d")
|
||||
)
|
||||
|
||||
# Fetch only the state-wide policies
|
||||
self.df = self.df.loc[self.df["Jurisdiction"] != "NAT_GOV"]
|
||||
|
||||
self.df.to_csv(
|
||||
os.path.join(data_dir, filename)
|
||||
) # Note: performs an overwrite
|
||||
else:
|
||||
print(
|
||||
"Not fetching the latest U.S. COVID-19 policies data from OxCGRT. "
|
||||
"Using whatever was saved earlier in {}!!".format(data_dir)
|
||||
)
|
||||
assert filename in os.listdir(data_dir)
|
||||
self.df = pd.read_csv(os.path.join(data_dir, filename), low_memory=False)
|
||||
|
||||
def process_policy_data(
|
||||
self, stringency_policy_key="StringencyIndex", num_stringency_levels=10
|
||||
):
|
||||
"""
|
||||
Gather the relevant policy indicator frm the dataframe,
|
||||
fill in the null values (if any),
|
||||
and discretize/quantize the policy into num_stringency_levels.
|
||||
Note: Possible values for stringency_policy_key are
|
||||
["StringencyIndex", "Government response index",
|
||||
"Containment and health index", "Economic Support index".]
|
||||
Reference: https://github.com/OxCGRT/covid-policy-tracker/blob/master/
|
||||
documentation/index_methodology.md
|
||||
"""
|
||||
|
||||
def discretize(policies, num_indicator_levels=10):
|
||||
"""
|
||||
Discretize the policies (a Pandas series) into num_indicator_levels
|
||||
"""
|
||||
# Indices are normalized to be in [0, 100]
|
||||
bins = np.linspace(0, 100, num_indicator_levels)
|
||||
# Find left and right values of bin and find the nearer edge
|
||||
bin_index = np.digitize(policies, bins, right=True)
|
||||
bin_left_edges = bins[bin_index - 1]
|
||||
bin_right_edges = bins[bin_index]
|
||||
discretized_policies = bin_index + np.argmin(
|
||||
np.stack(
|
||||
(
|
||||
np.abs(policies.values - bin_left_edges),
|
||||
np.abs(policies.values - bin_right_edges),
|
||||
)
|
||||
),
|
||||
axis=0,
|
||||
)
|
||||
return discretized_policies
|
||||
|
||||
# Gather just the relevant columns
|
||||
policy_df = self.df[["RegionName", "Date", stringency_policy_key]].copy()
|
||||
|
||||
# Fill in null values via a "forward fill"
|
||||
policy_df[stringency_policy_key].fillna(method="ffill", inplace=True)
|
||||
|
||||
# Discretize the stringency indices
|
||||
discretized_stringency_policies = discretize(
|
||||
policy_df[stringency_policy_key], num_indicator_levels=num_stringency_levels
|
||||
)
|
||||
policy_df.loc[:, stringency_policy_key] = discretized_stringency_policies
|
||||
|
||||
# Replace Washington DC by District of Columbia to keep consistent
|
||||
# (with the other data sources)
|
||||
policy_df = policy_df.replace("Washington DC", "District of Columbia")
|
||||
|
||||
policy_df = policy_df.sort_values(by=["RegionName", "Date"])
|
||||
|
||||
return policy_df
|
||||
Reference in New Issue
Block a user