Forecasting the Seven-Day Average Number of COVID-19 Cases
Import¶
Modules¶
%load_ext autoreload
%autoreload 2
import sys
from datetime import timedelta, datetime
from pathlib import Path
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
WARNING (theano.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
Set up figure plot settings
%config InlineBackend.figure_format = 'retina'
az.style.use("arviz-darkgrid")
sns.set_palette("deep")
Set up paths to import scripts
PROJECT_ROOT = Path.cwd().parent.resolve()
sys.path.append(str(PROJECT_ROOT))
Scripts¶
from src.data.make_dataset import make_cases_training_data, make_train_test_split, get_prior_distr_params
Data¶
data = make_cases_training_data()
Audit¶
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 66 entries, 0 to 65
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   notification_date       66 non-null     datetime64[ns]
 1   Daily Number of Cases   66 non-null     int64
 2   Cumsum                  66 non-null     int64
 3   Daily Difference        65 non-null     float64
 4   Growth Factor           64 non-null     float64
 5   Weekly Rolling Average  60 non-null     float64
 6   Pct Change              59 non-null     float64
 7   Weekly Average CumSum   60 non-null     float64
 8   Epidemiological Days    66 non-null     float64
dtypes: datetime64[ns](1), float64(6), int64(2)
memory usage: 7.2 KB
data
| | notification_date | Daily Number of Cases | Cumsum | Daily Difference | Growth Factor | Weekly Rolling Average | Pct Change | Weekly Average CumSum | Epidemiological Days |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-06-16 | 3 | 3 | NaN | NaN | NaN | NaN | NaN | -7.0 |
| 1 | 2021-06-17 | 1 | 4 | -2.0 | NaN | NaN | NaN | NaN | -6.0 |
| 2 | 2021-06-18 | 2 | 6 | 1.0 | -0.500000 | NaN | NaN | NaN | -5.0 |
| 3 | 2021-06-19 | 1 | 7 | -1.0 | -1.000000 | NaN | NaN | NaN | -4.0 |
| 4 | 2021-06-20 | 2 | 9 | 1.0 | -1.000000 | NaN | NaN | NaN | -3.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 61 | 2021-08-16 | 567 | 8755 | 64.0 | 0.357542 | 421.0 | 0.073980 | 7359.0 | 54.0 |
| 62 | 2021-08-17 | 547 | 9302 | -20.0 | -0.312500 | 451.0 | 0.071259 | 7810.0 | 55.0 |
| 63 | 2021-08-18 | 649 | 9951 | 102.0 | -5.100000 | 492.0 | 0.090909 | 8302.0 | 56.0 |
| 64 | 2021-08-19 | 771 | 10722 | 122.0 | 1.196078 | 545.0 | 0.107724 | 8847.0 | 57.0 |
| 65 | 2021-08-20 | 602 | 11324 | -169.0 | -1.385246 | 566.0 | 0.038532 | 9413.0 | 58.0 |
66 rows × 9 columns
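The engineered columns above are produced by make_cases_training_data in src.data.make_dataset, which is not shown in this notebook. As a rough, illustrative sketch only — the column definitions below are inferred from the audit, not taken from the source, and sketch_feature_engineering and its daily argument are hypothetical names — they could be derived from the daily counts along these lines:

# Illustrative reconstruction only; the real logic lives in src.data.make_dataset.
def sketch_feature_engineering(daily: pd.DataFrame) -> pd.DataFrame:
    """daily: one row per day with 'notification_date' and 'Daily Number of Cases'."""
    df = daily.sort_values("notification_date").reset_index(drop=True)
    df["Cumsum"] = df["Daily Number of Cases"].cumsum()
    df["Daily Difference"] = df["Daily Number of Cases"].diff()
    # Growth factor: today's increase relative to yesterday's increase.
    df["Growth Factor"] = df["Daily Difference"] / df["Daily Difference"].shift(1)
    df["Weekly Rolling Average"] = df["Daily Number of Cases"].rolling(7).mean()
    df["Pct Change"] = df["Weekly Rolling Average"].pct_change()
    df["Weekly Average CumSum"] = df["Weekly Rolling Average"].cumsum()
    # Assumption: day 0 is the first day total cases reach 10,
    # matching the "Days Since 10 Cases" axis label used later.
    start = df.index[df["Cumsum"] >= 10][0]
    df["Epidemiological Days"] = (df.index - start).astype(float)
    return df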
Split Data¶
train_size = 0.8
target = "Weekly Average CumSum"
X_train, X_test, y_train, y_test = make_train_test_split(data, target, train_size=train_size)
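make_train_test_split also lives in src.data.make_dataset and is not shown here. Below is a minimal sketch of what such a helper might do, assuming it keeps rows from epidemiological day 0 onward, uses the day index as the single feature, and splits chronologically (shuffling would leak future days into the training set); sketch_train_test_split is a hypothetical name and the actual helper may differ.

# Illustrative sketch only; train_test_split is already imported above.
def sketch_train_test_split(data, target, train_size=0.8):
    subset = data[data["Epidemiological Days"] >= 0].dropna(subset=[target])
    X = subset["Epidemiological Days"].to_numpy()
    y = subset[target].to_numpy()
    return train_test_split(X, y, train_size=train_size, shuffle=False)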
Get prior distribution parameters
initial_number_of_cases, daily_number_of_cases_std, average_pct_change, std_pct_change = get_prior_distr_params(y_train)
print(f"Number of cases at the start of the time series: {initial_number_of_cases}")
print(f"Daily number of cases standard deviation: {daily_number_of_cases_std}")
print(f"Average percentage change: {average_pct_change}")
print(f"Standard deviation of percentage change: {std_pct_change}")
Number of cases at the start of the time series: 10.0
Daily number of cases standard deviation: 2360.0832266209677
Average percentage change: 16.906979990854005
Standard deviation of percentage change: 76.41209457164739
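get_prior_distr_params is likewise defined in src.data.make_dataset. One plausible reading of the printed values is that the priors are simple empirical summaries of the training series; the formulas below are an assumption for illustration, not the actual implementation, and sketch_prior_params is a hypothetical name.

# Illustrative guess at the helper; the exact definitions are not shown in this notebook.
def sketch_prior_params(y_train):
    y = np.asarray(y_train, dtype=float)
    pct_change = np.diff(y) / y[:-1] * 100  # day-over-day growth, in percent
    return (
        y.min(),            # cases at the start of the series
        y.std(),            # spread of the cumulative series
        pct_change.mean(),  # average daily growth rate
        pct_change.std(),   # spread of the daily growth rate
    )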
Model Training¶
Train Model¶
# Create PyMC3 context manager
with pm.Model() as model:
    # Set up data for the model
    t = pm.Data("X", X_train)
    cases = pm.Data("y", y_train)

    # Intercept
    a = pm.Normal("a", mu=initial_number_of_cases, sigma=daily_number_of_cases_std)
    # Slope
    b = pm.Normal("b", mu=average_pct_change, sigma=std_pct_change)

    # Exponential regression
    growth = a * (1 + b) ** t

    # Likelihood error
    eps = pm.HalfNormal("eps")

    # Likelihood: these are counts, so a Poisson or negative binomial would be natural,
    # but they cause issues here because the independence assumption is violated by
    # transmission between people. A lognormal tends to work better.
    pm.Lognormal("cases", mu=np.log(growth), sigma=eps, observed=cases)

    trace = pm.sample(1000, tune=3000, cores=5, return_inferencedata=True)
    post_pred_train = pm.sample_posterior_predictive(trace)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (5 chains in 5 jobs)
NUTS: [eps, b, a]
Sampling 5 chains for 3_000 tune and 1_000 draw iterations (15_000 + 5_000 draws total) took 14 seconds.
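As an optional sanity check (not part of the original notebook), the fitted exponential curve a * (1 + b) ** t can be evaluated at the posterior means and compared with the training data. Because the Lognormal likelihood uses mu = np.log(growth), this curve is roughly the model's median trajectory rather than its mean.

# Illustrative sanity check only: posterior-mean growth curve vs. training data.
a_hat = float(trace.posterior["a"].mean())
b_hat = float(trace.posterior["b"].mean())
t_grid = np.sort(np.asarray(X_train, dtype=float).ravel())
growth_hat = a_hat * (1 + b_hat) ** t_grid

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(np.asarray(X_train).ravel(), np.asarray(y_train).ravel(), color="k", label="Training data")
ax.plot(t_grid, growth_hat, color="r", label="Posterior-mean growth curve")
ax.set(xlabel="Days Since 10 Cases", ylabel=target)
ax.legend();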
Model Evaluation¶
# Update data reference.
pm.set_data({"X": X_test}, model=model)
# Generate posterior samples
post_pred_test = pm.sample_posterior_predictive(trace, model=model, samples=1000)
/home/vscode/.local/lib/python3.9/site-packages/pymc3/sampling.py:1689: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample
  warnings.warn(
Get test prediction data
y_pred = np.mean(post_pred_test["cases"], axis=0)
y_std = np.std(post_pred_test["cases"], axis=0)
Define custom metrics for model performance evaluation
def point_set_distance(y_pred: np.ndarray, y_pred_std: np.ndarray, y_true: np.ndarray) -> np.ndarray:
    """Compute the point-to-set distance between the predicted and true values.

    Args:
        y_pred (np.ndarray): Predicted values.
        y_pred_std (np.ndarray): Standard deviation of the predicted values.
        y_true (np.ndarray): True values.

    Returns:
        np.ndarray: Point-to-set distance.

    References:
        [1] https://math.stackexchange.com/questions/2099527/distance-between-a-point-and-a-set-or-closure-of-it
    """
    assert y_pred.shape == y_true.shape, "The shapes of the predicted and true values do not match."
    assert y_pred.shape == y_pred_std.shape, "The shapes of the predicted and std values do not match."
    lower = y_pred - y_pred_std
    upper = y_pred + y_pred_std
    # Zero distance when the true value falls inside the +/- one-std band,
    # otherwise the distance to the nearest band edge.
    return np.where((lower <= y_true) & (y_true <= upper),
                    0,
                    np.minimum(np.abs(y_true - upper), np.abs(y_true - lower)))
def mean_squared_relative_error(y_pred: np.ndarray, y_true: np.ndarray,
                                distance: str = None, y_pred_std: np.ndarray = None) -> float:
    """Compute the mean squared relative error between the predicted and true values.

    Args:
        y_pred (np.ndarray): Predicted values.
        y_true (np.ndarray): True values.
        distance (str, optional): Distance metric to use. Defaults to None.
        y_pred_std (np.ndarray, optional): Standard deviation of the predicted values.

    Returns:
        float: Mean squared relative error between the predicted and true values.

    References:
        [1] https://stats.stackexchange.com/questions/413209/is-there-something-like-a-root-mean-square-relative-error-rmsre-or-what-is-t
    """
    if distance == "point to set":
        d = point_set_distance(y_pred, y_pred_std, y_true)
    else:
        d = y_pred - y_true
    return (d**2 / y_true**2).mean()
def relative_mean_squared_error(y_pred: np.ndarray, y_true: np.ndarray,
                                distance: str = None, y_pred_std: np.ndarray = None) -> float:
    """Compute the relative mean squared error between the predicted and true values.

    Args:
        y_pred (np.ndarray): Predicted values.
        y_true (np.ndarray): True values.
        distance (str, optional): Distance metric to use. Defaults to None.
        y_pred_std (np.ndarray, optional): Standard deviation of the predicted values.

    Returns:
        float: Relative mean squared error between the predicted and true values.

    References:
        [1] https://stats.stackexchange.com/questions/413209/is-there-something-like-a-root-mean-square-relative-error-rmsre-or-what-is-t
    """
    if distance == "point to set":
        d = point_set_distance(y_pred, y_pred_std, y_true)
    else:
        d = y_pred - y_true
    return (d**2).mean() / (y_true**2).sum()
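A quick toy check (not part of the original analysis) of how the custom metrics behave: a true value that falls inside the ±1 standard deviation band contributes zero point-to-set distance, otherwise the distance to the nearest band edge is used.

# Toy example with made-up numbers, for illustration only.
_y_true = np.array([100.0, 200.0, 300.0])
_y_hat = np.array([110.0, 180.0, 350.0])
_y_hat_std = np.array([15.0, 10.0, 20.0])

print(point_set_distance(_y_hat, _y_hat_std, _y_true))  # [ 0. 10. 30.]
print(mean_squared_relative_error(_y_hat, _y_true))
print(mean_squared_relative_error(_y_hat, _y_true, distance="point to set", y_pred_std=_y_hat_std))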
Calculate performance metrics
# Note: scikit-learn metrics expect (y_true, y_pred) in that order.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmsre = np.sqrt(mean_squared_relative_error(y_pred, y_test))
rrmse = np.sqrt(relative_mean_squared_error(y_pred, y_test))
mae = mean_absolute_error(y_test, y_pred)
rmae = (abs(y_pred - y_test) / y_test).mean()
print(f"R2 = {r2}")
print(f"RMSE = {rmse}, RMSRE = {rmsre}")
print(f"MAE = {mae}, RMAE = {rmae}")
R2 = 0.6849049328486042
RMSE = 3526.921058068736, RMSRE = 0.9865672360267822
MAE = 2063.09464629102, RMAE = 0.593140083841747
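The "point to set" variants defined above are not exercised in the run shown. With the posterior predictive spread y_std already computed, they could be evaluated as follows (illustrative only; the resulting values are not reported here).

# Optional: distance-aware relative errors using the posterior predictive spread.
rmsre_pts = np.sqrt(mean_squared_relative_error(y_pred, y_test, distance="point to set", y_pred_std=y_std))
rrmse_pts = np.sqrt(relative_mean_squared_error(y_pred, y_test, distance="point to set", y_pred_std=y_std))
print(f"RMSRE (point to set) = {rmsre_pts}, RRMSE (point to set) = {rrmse_pts}")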
Plot residuals
fig, ax = plt.subplots(figsize=(20, 10))
ax = sns.scatterplot(x=X_test.squeeze(), y=(y_pred - y_test).squeeze(), marker="o", ax=ax)
ax.set(ylabel="Residual", xlabel="Days Since 10 Cases", title="Residual Plot");
Model Inference¶
Distribution of model parameters
az.plot_trace(trace)
plt.show()
Summary of model parameters
pm.summary(trace).round(3)
| | mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat |
|---|---|---|---|---|---|---|---|---|---|
| a | 65.205 | 8.167 | 51.099 | 81.515 | 0.167 | 0.119 | 2397.0 | 2024.0 | 1.0 |
| b | 0.100 | 0.004 | 0.092 | 0.108 | 0.000 | 0.000 | 2363.0 | 2272.0 | 1.0 |
| eps | 0.430 | 0.047 | 0.348 | 0.518 | 0.001 | 0.001 | 2432.0 | 2183.0 | 1.0 |
Predict on Whole Data Set¶
# Update data reference
mask = data['Epidemiological Days'] >= 0
X = data.loc[mask, 'Epidemiological Days'].values.squeeze().reshape(-1, 1)
pm.set_data({"X": X.flatten()}, model=model)
# Generate posterior samples
post_pred = pm.sample_posterior_predictive(trace, model=model, samples=1000)
/home/vscode/.local/lib/python3.9/site-packages/pymc3/sampling.py:1689: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample
  warnings.warn(
Plot the model posterior predictive and the actual weekly rolling average for non-negative epidemiological days
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(X, post_pred["cases"].T, color="k", alpha=0.05)
ax.plot(X, data.loc[mask, target], color="r", label="Actual")
ax.set(xlabel="Days Since 10 Cases", ylabel=target, title="Cumulative Rolling Average")
ax.legend()
Seven Days Forecast¶
# Augment epidemiological days with the forecast horizon
forecast_ndays = 7
X_forecast = np.append(X, list(np.arange(X.max() + 1, X.max() + forecast_ndays + 1))).reshape(-1, 1)
# Update data reference.
pm.set_data({"X": X_forecast.flatten()}, model=model)
# Generate posterior samples
post_pred_forecast = pm.sample_posterior_predictive(trace, model=model, samples=1000)
/home/vscode/.local/lib/python3.9/site-packages/pymc3/sampling.py:1689: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample
  warnings.warn(
Create data set containing forecast data
mask = data["Epidemiological Days"] >= 0
data_forecast = pd.DataFrame({"Epidemiological Days": X_forecast.flatten()
,target: data.loc[mask, target].append(pd.Series([np.nan]*forecast_ndays))
# ,"Daily Number of Cases": data.loc[mask, "Daily Number of Cases"].append(pd.Series([np.nan]*forecast_ndays))
,"Predicted Weekly Average CumSum": np.mean(post_pred_forecast["cases"], axis=0).flatten()
,"notification_date": data.loc[mask, "notification_date"].append(pd.Series([data["notification_date"].max() + timedelta(days=day) for day in range(1, forecast_ndays+1)]))
,"Predicted Cum Weekly Std": np.std(post_pred_forecast["cases"], axis=0).flatten()
})
data_forecast.reset_index(inplace=True, drop=True)
data_forecast["Prediction Lower Bound"] = data_forecast["Predicted Weekly Average CumSum"] - 2*data_forecast["Predicted Cum Weekly Std"]
data_forecast["Prediction Upper Bound"] = data_forecast["Predicted Weekly Average CumSum"] + 2*data_forecast["Predicted Cum Weekly Std"]
data_forecast["Forecast_On_Date"] = datetime.now().date()
data_forecast[["Predicted Weekly Average CumSum", "Prediction Lower Bound", "Prediction Upper Bound", "Forecast_On_Date", "notification_date"]].to_csv(str(PROJECT_ROOT / "data" / "processed" / f"predicted_weekly_rolling_average_{datetime.now().date()}.csv"), index=False)
data_forecast
| | Epidemiological Days | Weekly Average CumSum | Predicted Weekly Average CumSum | notification_date | Predicted Cum Weekly Std | Prediction Lower Bound | Prediction Upper Bound | Forecast_On_Date |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 10.0 | 69.696767 | 2021-06-23 | 31.201721 | 7.293324 | 132.100209 | 2021-08-22 |
| 1 | 1.0 | 19.0 | 80.685246 | 2021-06-24 | 40.516835 | -0.348425 | 161.718917 | 2021-08-22 |
| 2 | 2.0 | 31.0 | 83.639161 | 2021-06-25 | 37.957055 | 7.725052 | 159.553270 | 2021-08-22 |
| 3 | 3.0 | 47.0 | 95.105675 | 2021-06-26 | 45.403931 | 4.297814 | 185.913536 | 2021-08-22 |
| 4 | 4.0 | 65.0 | 104.708095 | 2021-06-27 | 49.149855 | 6.408385 | 203.007804 | 2021-08-22 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 61 | 61.0 | NaN | 24340.885055 | 2021-08-23 | 11594.175844 | 1152.533367 | 47529.236742 | 2021-08-22 |
| 62 | 62.0 | NaN | 26845.874473 | 2021-08-24 | 12355.187511 | 2135.499450 | 51556.249495 | 2021-08-22 |
| 63 | 63.0 | NaN | 29931.777992 | 2021-08-25 | 14329.609802 | 1272.558387 | 58590.997597 | 2021-08-22 |
| 64 | 64.0 | NaN | 32499.844518 | 2021-08-26 | 15069.594744 | 2360.655030 | 62639.034007 | 2021-08-22 |
| 65 | 65.0 | NaN | 35815.473604 | 2021-08-27 | 18790.128850 | -1764.784095 | 73395.731303 | 2021-08-22 |

66 rows × 8 columns
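The Prediction Lower/Upper Bound columns above use mean ± 2 standard deviations, which assumes roughly symmetric predictive draws; the lognormal likelihood actually produces right-skewed samples, which is why some lower bounds go negative. An alternative, not used in the original run, is to take empirical percentiles of the posterior predictive draws, as in this sketch (forecast_interval is a hypothetical name).

# Illustrative alternative: empirical 2.5% / 97.5% percentile interval.
lower_q, upper_q = np.percentile(post_pred_forecast["cases"], [2.5, 97.5], axis=0)
forecast_interval = pd.DataFrame({
    "notification_date": data_forecast["notification_date"],
    "Percentile Lower Bound": lower_q.flatten(),
    "Percentile Upper Bound": upper_q.flatten(),
})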
data_forecast[[target, "Predicted Weekly Average CumSum"]]
| | Weekly Average CumSum | Predicted Weekly Average CumSum |
|---|---|---|
| 0 | 10.0 | 69.696767 |
| 1 | 19.0 | 80.685246 |
| 2 | 31.0 | 83.639161 |
| 3 | 47.0 | 95.105675 |
| 4 | 65.0 | 104.708095 |
| ... | ... | ... |
| 61 | NaN | 24340.885055 |
| 62 | NaN | 26845.874473 |
| 63 | NaN | 29931.777992 |
| 64 | NaN | 32499.844518 |
| 65 | NaN | 35815.473604 |
66 rows × 2 columns
data_forecast["Predicted Weekly Average CumSum"].diff()
0             NaN
1       10.988479
2        2.953915
3       11.466514
4        9.602420
          ...
61    1892.610174
62    2504.989418
63    3085.903519
64    2568.066526
65    3315.629086
Name: Predicted Weekly Average CumSum, Length: 66, dtype: float64
Plot actual and forecasted values
forecast_std = np.std(data_forecast["Predicted Weekly Average CumSum"].diff()).flatten()
x_axis=data_forecast["Epidemiological Days"].values
fig, ax = plt.subplots(figsize=(20, 10))
ax = sns.lineplot(x=x_axis, y=data_forecast["Predicted Weekly Average CumSum"], color="r", label="Predicted Rolling Average", marker="o", ax=ax)
ax = sns.lineplot(x=x_axis, y=data_forecast[target], color="k", label="Actual Rolling Average", marker="o", ax=ax)
# ax = sns.lineplot(x=x_axis, y=np.mean(post_pred_forecast["cases"], axis=0).flatten(), color="b", marker="o", label="Predicted Rolling Average", linestyle="--", ax=ax)
# ax = sns.lineplot(x=x_axis, y=data_forecast["Predicted Weekly Average CumSum"].diff() + forecast_std.flatten(), color="b", linestyle=":", label="Prediction Uncertainty", ax=ax)
# ax = sns.lineplot(x=x_axis, y=data_forecast["Predicted Weekly Average CumSum"].diff() - forecast_std.flatten(), color="b", linestyle=":", ax=ax)
plt.xticks(data_forecast["Epidemiological Days"], data_forecast["notification_date"].astype(str), rotation = 45, ha="right")
ax.set(xlabel="Day", ylabel="Number of Cases", title="Number of Cases Since 10 Cases")
ax.legend()
ax.grid(True)
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Sun Aug 22 2021

Python implementation: CPython
Python version       : 3.9.2
IPython version      : 7.26.0

seaborn   : 0.11.2
numpy     : 1.21.2
sys       : 3.9.2 (default, Mar 12 2021, 19:04:51) [GCC 8.3.0]
pandas    : 1.3.2
pymc3     : 3.11.2
arviz     : 0.11.2
matplotlib: 3.4.3

Watermark: 2.2.0