Forecasting Seven Days Average Number of COVID19 Cases
Updated:
Import¶
Modules¶
In [1]:
%load_ext autoreload
%autoreload 2
import sys
from datetime import timedelta, datetime
from pathlib import Path
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
WARNING (theano.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
Setup figure plot settings
In [2]:
%config InlineBackend.figure_format = 'retina'
az.style.use("arviz-darkgrid")
sns.set_palette("deep")
Setup paths to import scripts
In [3]:
PROJECT_ROOT = Path.cwd().parent.resolve()
sys.path.append(str(PROJECT_ROOT))
Scripts¶
In [4]:
from src.data.make_dataset import make_cases_training_data, make_train_test_split, get_prior_distr_params
Data¶
In [5]:
data = make_cases_training_data()
Audit¶
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 66 entries, 0 to 65 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 notification_date 66 non-null datetime64[ns] 1 Daily Number of Cases 66 non-null int64 2 Cumsum 66 non-null int64 3 Daily Difference 65 non-null float64 4 Growth Factor 64 non-null float64 5 Weekly Rolling Average 60 non-null float64 6 Pct Change 59 non-null float64 7 Weekly Average CumSum 60 non-null float64 8 Epidemiological Days 66 non-null float64 dtypes: datetime64[ns](1), float64(6), int64(2) memory usage: 7.2 KB
In [7]:
data
Out[7]:
notification_date | Daily Number of Cases | Cumsum | Daily Difference | Growth Factor | Weekly Rolling Average | Pct Change | Weekly Average CumSum | Epidemiological Days | |
---|---|---|---|---|---|---|---|---|---|
0 | 2021-06-16 | 3 | 3 | NaN | NaN | NaN | NaN | NaN | -7.0 |
1 | 2021-06-17 | 1 | 4 | -2.0 | NaN | NaN | NaN | NaN | -6.0 |
2 | 2021-06-18 | 2 | 6 | 1.0 | -0.500000 | NaN | NaN | NaN | -5.0 |
3 | 2021-06-19 | 1 | 7 | -1.0 | -1.000000 | NaN | NaN | NaN | -4.0 |
4 | 2021-06-20 | 2 | 9 | 1.0 | -1.000000 | NaN | NaN | NaN | -3.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
61 | 2021-08-16 | 567 | 8755 | 64.0 | 0.357542 | 421.0 | 0.073980 | 7359.0 | 54.0 |
62 | 2021-08-17 | 547 | 9302 | -20.0 | -0.312500 | 451.0 | 0.071259 | 7810.0 | 55.0 |
63 | 2021-08-18 | 649 | 9951 | 102.0 | -5.100000 | 492.0 | 0.090909 | 8302.0 | 56.0 |
64 | 2021-08-19 | 771 | 10722 | 122.0 | 1.196078 | 545.0 | 0.107724 | 8847.0 | 57.0 |
65 | 2021-08-20 | 602 | 11324 | -169.0 | -1.385246 | 566.0 | 0.038532 | 9413.0 | 58.0 |
66 rows × 9 columns
Split Data¶
In [8]:
train_size = 0.8
target = "Weekly Average CumSum"
X_train, X_test, y_train, y_test = make_train_test_split(data, target, train_size=train_size)
Get prior distribution parameters
In [9]:
initial_number_of_cases, daily_number_of_cases_std, average_pct_change, std_pct_change = get_prior_distr_params(y_train)
print(f"Number of cases at the start of the time series: {initial_number_of_cases}")
print(f"Daily number of cases standard deviation: {daily_number_of_cases_std}")
print(f"Average percentage change: {average_pct_change}")
print(f"Standard deviation of percentage change: {std_pct_change}")
Number of cases at the start of the time series: 10.0 Daily number of cases standard deviation: 2360.0832266209677 Average percentage change: 16.906979990854005 Standard deviation of percentage change: 76.41209457164739
Model Training¶
Train Model¶
In [10]:
# Create PyMC3 context manager
with pm.Model() as model:
# Setup data for the model
t = pm.Data("X", X_train)
cases = pm.Data("y", y_train)
# Intercept
a = pm.Normal("a", mu=initial_number_of_cases, sigma=daily_number_of_cases_std)
# Slope
b = pm.Normal("b", mu=average_pct_change, sigma=std_pct_change)
# Exponential regression
growth = a * (1 + b) ** t
# Likelihood error
eps = pm.HalfNormal("eps")
# Likelihood - Counts here, so Poisson or negative binomial, but they causes issues.
# Independence hypothesis is violated, due to transmission between people.
# Lognormal tends to work better?
pm.Lognormal("cases", mu=np.log(growth), sigma=eps, observed=cases)
trace = pm.sample(1000, tune=3000, cores=5, return_inferencedata=True)
post_Pred = pm.sample_posterior_predictive(trace)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (5 chains in 5 jobs) NUTS: [eps, b, a]
100.00% [20000/20000 00:13<00:00 Sampling 5 chains, 0 divergences]
Sampling 5 chains for 3_000 tune and 1_000 draw iterations (15_000 + 5_000 draws total) took 14 seconds.
100.00% [5000/5000 00:58<00:00]
Model Evaluation¶
In [11]:
# Update data reference.
pm.set_data({"X": X_test}, model=model)
# Generate posterior samples
post_pred_test = pm.sample_posterior_predictive(trace, model=model, samples=1000)
/home/vscode/.local/lib/python3.9/site-packages/pymc3/sampling.py:1689: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample warnings.warn(
100.00% [1000/1000 00:12<00:00]
Get test prediction datam
In [12]:
y_pred = np.mean(post_pred_test["cases"], axis=0)
y_std = np.std(post_pred_test["cases"], axis=0)
Define custom metrics for model performance evaluation
In [13]:
def point_set_distance(y_pred: np.ndarray, y_pred_std: np.ndarray, y_true: np.ndarray
) -> np.ndarray:
"""Compute the point-to-set distance between the predicted and true values.
Args:
y_pred (np.ndarray): Numpy array. Predicted value.
y_true (np.ndarray): Numpy array. True value.
y_pred_std (np.ndarray, optional): Standard deviation of the predicted value.
Returns:
np.ndarray: point-to-set distance: Numpy array.
References:
[1] https://math.stackexchange.com/questions/2099527/distance-between-a-point-and-a-set-or-closure-of-it
"""
assert y_pred.shape == y_true.shape, "The shapes of the predicted and true values do not match."
assert y_pred.shape == y_pred_std.shape, "The shapes of predicted and std values do not match."
return np.where(y_pred - y_pred_std <= y_true <= y_pred + y_pred_std, 0
,min(abs(y_true - (y_pred + y_pred_std))
,abs(y_true - (y_pred - y_pred_std))
)
)
def mean_squared_relative_error(y_pred: np.ndarray, y_true: np.ndarray
,distance: str=None, y_pred_std: np.ndarray=None
) -> float:
"""Compute the mean squared relative error between the predicted and true values.
Args:
y_pred (np.ndarray): Numpy array. Predicted value.
y_true (np.ndarray): Numpy array. True value.
distance (str, optional): Distance metric to use. Defaults to None.
y_pred_std (np.ndarray, optional): Standard deviation of the predicted value.
Returns:
float: mean squared relative error: Numpy array. Mean squared relative error between the predicted and true values.
References:
[1] https://stats.stackexchange.com/questions/413209/is-there-something-like-a-root-mean-square-relative-error-rmsre-or-what-is-t
"""
if distance == "point to set":
d = point_set_distance(y_pred, y_pred_std, y_true)
else:
d = y_pred - y_true
return (d**2/y_true**2).mean()
def relative_mean_squared_error(y_pred: np.ndarray, y_true: np.ndarray
,distance: str=None, y_pred_std: np.ndarray=None
) -> float:
"""Compute reative mean squared error between the predicted and true values.
Args:
y_pred (np.ndarray): Numpy array. Predicted value.
y_true (np.ndarray): Numpy array. True value.
distance (str, optional): Distance metric to use. Defaults to None.
y_pred_std (np.ndarray, optional): Standard deviation of the predicted value.
Returns:
float: Relative mean squared error between the predicted and true values.
References:
[1] https://stats.stackexchange.com/questions/413209/is-there-something-like-a-root-mean-square-relative-error-rmsre-or-what-is-t
"""
if distance == "point to set":
d = point_set_distance(y_pred, y_pred_std, y_true)
else:
d = y_pred - y_true
return (d**2).mean()/(y_true**2).sum()
Calculate performance metrics
In [14]:
r2 = r2_score(y_pred, y_test)
rmse = np.sqrt(mean_squared_error(y_pred, y_test))
rmsre = np.sqrt(mean_squared_relative_error(y_pred, y_test))
rrmse = np.sqrt(relative_mean_squared_error(y_pred, y_test))
mae = mean_absolute_error(y_pred, y_test)
rmae = (abs(y_pred - y_test) / y_test).mean()
print(f"R2 = {r2}")
print(f"RMSE = {rmse}, RMSRE = {rmsre}")
print(f"MAE = {mae}, RMAE = {rmae}")
R2 = 0.6849049328486042 RMSE = 3526.921058068736, RMSRE = 0.9865672360267822 MAE = 2063.09464629102, RMAE = 0.593140083841747
Plot residuals
In [15]:
fig, ax = plt.subplots(figsize=(20, 10))
ax = sns.scatterplot(x=X_test.squeeze(), y=(y_pred - y_test).squeeze(), marker="o", ax=ax)
ax.set(ylabel="Residual", xlabel="Days Since 10 Cases", title="Residual Plot");
Model Inference¶
Distribution of model parameters
In [16]:
az.plot_trace(trace)
plt.show()