In [1]:

              %load_ext autoreload
%autoreload 2

import sys
from datetime import timedelta, datetime
from pathlib import Path

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

            

WARNING (theano.tensor.blas): Using NumPy C-API based implementation for BLAS functions.

In [2]:

              %config InlineBackend.figure_format = 'retina'
az.style.use("arviz-darkgrid")
sns.set_palette("deep")

            

In [3]:

              PROJECT_ROOT = Path.cwd().parent.resolve()
sys.path.append(str(PROJECT_ROOT))

In [4]:

              from src.data.make_dataset import make_cases_training_data, make_train_test_split, get_prior_distr_params

            

In [5]:

              data = make_cases_training_data()

            

In [6]:

              data.info()

            

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66 entries, 0 to 65
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   notification_date       66 non-null     datetime64[ns]
 1   Daily Number of Cases   66 non-null     int64         
 2   Cumsum                  66 non-null     int64         
 3   Daily Difference        65 non-null     float64       
 4   Growth Factor           64 non-null     float64       
 5   Weekly Rolling Average  60 non-null     float64       
 6   Pct Change              59 non-null     float64       
 7   Weekly Average CumSum   60 non-null     float64       
 8   Epidemiological Days    66 non-null     float64       
dtypes: datetime64[ns](1), float64(6), int64(2)
memory usage: 7.2 KB

In [7]:

data

Out[7]:

	notification_date	Daily Number of Cases	Cumsum	Daily Difference	Growth Factor	Weekly Rolling Average	Pct Change	Weekly Average CumSum	Epidemiological Days
0	2021-06-16	3	3	NaN	NaN	NaN	NaN	NaN	-7.0
1	2021-06-17	1	4	-2.0	NaN	NaN	NaN	NaN	-6.0
2	2021-06-18	2	6	1.0	-0.500000	NaN	NaN	NaN	-5.0
3	2021-06-19	1	7	-1.0	-1.000000	NaN	NaN	NaN	-4.0
4	2021-06-20	2	9	1.0	-1.000000	NaN	NaN	NaN	-3.0
...	...	...	...	...	...	...	...	...	...
61	2021-08-16	567	8755	64.0	0.357542	421.0	0.073980	7359.0	54.0
62	2021-08-17	547	9302	-20.0	-0.312500	451.0	0.071259	7810.0	55.0
63	2021-08-18	649	9951	102.0	-5.100000	492.0	0.090909	8302.0	56.0
64	2021-08-19	771	10722	122.0	1.196078	545.0	0.107724	8847.0	57.0
65	2021-08-20	602	11324	-169.0	-1.385246	566.0	0.038532	9413.0	58.0

66 rows × 9 columns

In [8]:

              train_size = 0.8
target = "Weekly Average CumSum"
X_train, X_test, y_train, y_test = make_train_test_split(data, target, train_size=train_size)

            

In [9]:

              initial_number_of_cases, daily_number_of_cases_std, average_pct_change, std_pct_change = get_prior_distr_params(y_train)
print(f"Number of cases at the start of the time series: {initial_number_of_cases}")
print(f"Daily number of cases standard deviation: {daily_number_of_cases_std}")
print(f"Average percentage change: {average_pct_change}")
print(f"Standard deviation of percentage change: {std_pct_change}")

            

Number of cases at the start of the time series: 10.0
Daily number of cases standard deviation: 2360.0832266209677
Average percentage change: 16.906979990854005
Standard deviation of percentage change: 76.41209457164739

In [10]:

              # Create PyMC3 context manager
with pm.Model() as model:

    # Setup data for the model
    t = pm.Data("X", X_train)
    cases = pm.Data("y", y_train)

    # Intercept
    a = pm.Normal("a", mu=initial_number_of_cases, sigma=daily_number_of_cases_std)

    # Slope 
    b = pm.Normal("b", mu=average_pct_change, sigma=std_pct_change)

    # Exponential regression
    growth = a * (1 + b) ** t

    # Likelihood error
    eps = pm.HalfNormal("eps")

    # Likelihood - Counts here, so Poisson or negative binomial, but they causes issues.
    # Independence hypothesis is violated, due to transmission between people.
    # Lognormal tends to work better?
    pm.Lognormal("cases", mu=np.log(growth), sigma=eps, observed=cases)

    trace = pm.sample(1000, tune=3000, cores=5, return_inferencedata=True)
    post_Pred = pm.sample_posterior_predictive(trace)

            

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (5 chains in 5 jobs)
NUTS: [eps, b, a]

100.00% [20000/20000 00:13<00:00 Sampling 5 chains, 0 divergences]

Sampling 5 chains for 3_000 tune and 1_000 draw iterations (15_000 + 5_000 draws total) took 14 seconds.

100.00% [5000/5000 00:58<00:00]

In [11]:

                  # Update data reference.
pm.set_data({"X": X_test}, model=model)

# Generate posterior samples
post_pred_test = pm.sample_posterior_predictive(trace, model=model, samples=1000)

/home/vscode/.local/lib/python3.9/site-packages/pymc3/sampling.py:1689: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample
  warnings.warn(

100.00% [1000/1000 00:12<00:00]

In [12]:

                    y_pred = np.mean(post_pred_test["cases"], axis=0)
y_std = np.std(post_pred_test["cases"], axis=0)

In [13]:

                    def point_set_distance(y_pred: np.ndarray, y_pred_std: np.ndarray, y_true: np.ndarray
                        ) -> np.ndarray:
    """Compute the point-to-set distance between the predicted and true values.

    Args:
        y_pred (np.ndarray): Numpy array. Predicted value.
        y_true (np.ndarray): Numpy array. True value.
        y_pred_std (np.ndarray, optional): Standard deviation of the predicted value.

    Returns:
        np.ndarray: point-to-set distance: Numpy array.

    References:
        [1] https://math.stackexchange.com/questions/2099527/distance-between-a-point-and-a-set-or-closure-of-it
    """
    assert y_pred.shape == y_true.shape, "The shapes of the predicted and true values do not match."
    assert y_pred.shape == y_pred_std.shape, "The shapes of predicted and std values do not match."

    return np.where(y_pred - y_pred_std <= y_true <= y_pred + y_pred_std, 0
            ,min(abs(y_true - (y_pred + y_pred_std))
                ,abs(y_true - (y_pred - y_pred_std))
                )
            )


def mean_squared_relative_error(y_pred: np.ndarray, y_true: np.ndarray
                                ,distance: str=None, y_pred_std: np.ndarray=None
                                ) -> float:
    """Compute the mean squared relative error between the predicted and true values.

    Args:
        y_pred (np.ndarray): Numpy array. Predicted value.
        y_true (np.ndarray): Numpy array. True value.
        distance (str, optional): Distance metric to use. Defaults to None.
        y_pred_std (np.ndarray, optional): Standard deviation of the predicted value.

    Returns:
        float: mean squared relative error: Numpy array. Mean squared relative error between the predicted and true values.

    References:
        [1] https://stats.stackexchange.com/questions/413209/is-there-something-like-a-root-mean-square-relative-error-rmsre-or-what-is-t
    """
    if distance == "point to set":
        d = point_set_distance(y_pred, y_pred_std, y_true)

    else:
        d = y_pred - y_true

    return (d**2/y_true**2).mean()


def relative_mean_squared_error(y_pred: np.ndarray, y_true: np.ndarray
                                ,distance: str=None, y_pred_std: np.ndarray=None
                                ) -> float:
    """Compute reative mean squared error between the predicted and true values.

    Args:
        y_pred (np.ndarray): Numpy array. Predicted value.
        y_true (np.ndarray): Numpy array. True value.
        distance (str, optional): Distance metric to use. Defaults to None.
        y_pred_std (np.ndarray, optional): Standard deviation of the predicted value.

    Returns:
        float: Relative mean squared error between the predicted and true values.

    References:
        [1] https://stats.stackexchange.com/questions/413209/is-there-something-like-a-root-mean-square-relative-error-rmsre-or-what-is-t
    """

    if distance == "point to set":
        d = point_set_distance(y_pred, y_pred_std, y_true)

    else:
        d = y_pred - y_true

    return (d**2).mean()/(y_true**2).sum()

                  

In [14]:

                    r2 = r2_score(y_pred, y_test)

rmse = np.sqrt(mean_squared_error(y_pred, y_test))
rmsre = np.sqrt(mean_squared_relative_error(y_pred, y_test))
rrmse = np.sqrt(relative_mean_squared_error(y_pred, y_test))
mae = mean_absolute_error(y_pred, y_test)
rmae = (abs(y_pred - y_test) / y_test).mean()
print(f"R2 = {r2}")
print(f"RMSE = {rmse}, RMSRE = {rmsre}")
print(f"MAE = {mae}, RMAE = {rmae}")

                  

R2 = 0.6849049328486042
RMSE = 3526.921058068736, RMSRE = 0.9865672360267822
MAE = 2063.09464629102, RMAE = 0.593140083841747

In [15]:

                    fig, ax = plt.subplots(figsize=(20, 10))
ax = sns.scatterplot(x=X_test.squeeze(), y=(y_pred - y_test).squeeze(), marker="o", ax=ax)
ax.set(ylabel="Residual", xlabel="Days Since 10 Cases", title="Residual Plot");

                  

In [16]:

                    az.plot_trace(trace)
plt.show()

In [17]:

                    pm.summary(trace).round(3)

                  

Out[17]:

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
a	65.205	8.167	51.099	81.515	0.167	0.119	2397.0	2024.0	1.0
b	0.100	0.004	0.092	0.108	0.000	0.000	2363.0	2272.0	1.0
eps	0.430	0.047	0.348	0.518	0.001	0.001	2432.0	2183.0	1.0

In [18]:

                    # Update data reference
mask = data['Epidemiological Days'] >= 0
X = data.loc[mask, 'Epidemiological Days'].values.squeeze().reshape(-1, 1)
pm.set_data({"X": X.flatten()}, model=model)

# Generate posterior samples
post_pred = pm.sample_posterior_predictive(trace, model=model, samples=1000)

                  

/home/vscode/.local/lib/python3.9/site-packages/pymc3/sampling.py:1689: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample
  warnings.warn(

100.00% [1000/1000 00:13<00:00]

In [19]:

                      fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(X, post_pred["cases"].T, color="k", alpha=0.05)
ax.plot(X, data.loc[mask, target], color="r", label="Actual")
ax.set(xlabel="Days Since 10 Cases", ylabel=target, title="Cumulative Rolling Average")
ax.legend()

                    

Out[19]:

<matplotlib.legend.Legend at 0x7fed632fa520>

In [20]:

                      # Augment epidemioligical days
forecast_ndays = 7
X_forecast = np.append(X, list(np.arange(X.max() + 1, X.max() + forecast_ndays + 1))).reshape(-1, 1)

# Update data reference.
pm.set_data({"X": X_forecast.flatten()}, model=model)

# Generate posterior samples
post_pred_forecast = pm.sample_posterior_predictive(trace, model=model, samples=1000)

                    

/home/vscode/.local/lib/python3.9/site-packages/pymc3/sampling.py:1689: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample
  warnings.warn(

100.00% [1000/1000 00:12<00:00]

In [21]:

                        mask = data["Epidemiological Days"] >= 0
data_forecast = pd.DataFrame({"Epidemiological Days": X_forecast.flatten()
                            ,target: data.loc[mask, target].append(pd.Series([np.nan]*forecast_ndays))
                            # ,"Daily Number of Cases": data.loc[mask, "Daily Number of Cases"].append(pd.Series([np.nan]*forecast_ndays))
                            ,"Predicted Weekly Average CumSum": np.mean(post_pred_forecast["cases"], axis=0).flatten()
                            ,"notification_date": data.loc[mask, "notification_date"].append(pd.Series([data["notification_date"].max() + timedelta(days=day) for day in range(1, forecast_ndays+1)]))
                            ,"Predicted Cum Weekly Std": np.std(post_pred_forecast["cases"], axis=0).flatten()
                            })
data_forecast.reset_index(inplace=True, drop=True)
data_forecast["Prediction Lower Bound"] = data_forecast["Predicted Weekly Average CumSum"] - 2*data_forecast["Predicted Cum Weekly Std"]
data_forecast["Prediction Upper Bound"] = data_forecast["Predicted Weekly Average CumSum"] + 2*data_forecast["Predicted Cum Weekly Std"]
data_forecast["Forecast_On_Date"] = datetime.now().date()
data_forecast[["Predicted Weekly Average CumSum", "Prediction Lower Bound", "Prediction Upper Bound", "Forecast_On_Date", "notification_date"]].to_csv(str(PROJECT_ROOT / "data" / "processed" / f"predicted_weekly_rolling_average_{datetime.now().date()}.csv"), index=False)
data_forecast

                      

Out[21]:

	Epidemiological Days	Weekly Average CumSum	Predicted Weekly Average CumSum	notification_date	Predicted Cum Weekly Std	Prediction Lower Bound	Prediction Upper Bound	Forecast_On_Date
0	0.0	10.0	69.696767	2021-06-23	31.201721	7.293324	132.100209	2021-08-22
1	1.0	19.0	80.685246	2021-06-24	40.516835	-0.348425	161.718917	2021-08-22
2	2.0	31.0	83.639161	2021-06-25	37.957055	7.725052	159.553270	2021-08-22
3	3.0	47.0	95.105675	2021-06-26	45.403931	4.297814	185.913536	2021-08-22
4	4.0	65.0	104.708095	2021-06-27	49.149855	6.408385	203.007804	2021-08-22
...	...	...	...	...	...	...	...	...
61	61.0	NaN	24340.885055	2021-08-23	11594.175844	1152.533367	47529.236742	2021-08-22
62	62.0	NaN	26845.874473	2021-08-24	12355.187511	2135.499450	51556.249495	2021-08-22
63	63.0	NaN	29931.777992	2021-08-25	14329.609802	1272.558387	58590.997597	2021-08-22
64	64.0	NaN	32499.844518	2021-08-26	15069.594744	2360.655030	62639.034007	2021-08-22
65	65.0	NaN	35815.473604	2021-08-27	18790.128850	-1764.784095	73395.731303	2021-08-22

66 rows × 8 columns

In [32]:

                        data_forecast[[target, "Predicted Weekly Average CumSum"]]

                      

Out[32]:

	Weekly Average CumSum	Predicted Weekly Average CumSum
0	10.0	69.696767
1	19.0	80.685246
2	31.0	83.639161
3	47.0	95.105675
4	65.0	104.708095
...	...	...
61	NaN	24340.885055
62	NaN	26845.874473
63	NaN	29931.777992
64	NaN	32499.844518
65	NaN	35815.473604

66 rows × 2 columns

In [22]:

                        data_forecast["Predicted Weekly Average CumSum"].diff()

                      

Out[22]:

0             NaN
1       10.988479
2        2.953915
3       11.466514
4        9.602420
         ...     
61    1892.610174
62    2504.989418
63    3085.903519
64    2568.066526
65    3315.629086
Name: Predicted Weekly Average CumSum, Length: 66, dtype: float64

In [41]:

                        forecast_std = np.std(data_forecast["Predicted Weekly Average CumSum"].diff()).flatten()
x_axis=data_forecast["Epidemiological Days"].values

fig, ax = plt.subplots(figsize=(20, 10))
ax = sns.lineplot(x=x_axis, y=data_forecast["Predicted Weekly Average CumSum"], color="r", label="Actual Rolling Average", marker="o", ax=ax)
ax = sns.lineplot(x=x_axis, y=data_forecast[target], color="k", label="Daily Number of Cases", marker="o", ax=ax)
# ax = sns.lineplot(x=x_axis, y=np.mean(post_pred_forecast["cases"], axis=0).flatten(), color="b", marker="o", label="Predicted Rolling Average", linestyle="--", ax=ax)
# ax = sns.lineplot(x=x_axis, y=data_forecast["Predicted Weekly Average CumSum"].diff() + forecast_std.flatten(), color="b", linestyle=":", label="Prediction Uncertainty", ax=ax)
# ax = sns.lineplot(x=x_axis, y=data_forecast["Predicted Weekly Average CumSum"].diff() - forecast_std.flatten(), color="b", linestyle=":", ax=ax)


plt.xticks(data_forecast["Epidemiological Days"], data_forecast["notification_date"].astype(str), rotation = 45, ha="right")

ax.set(xlabel="Day", ylabel="Number of Cases", title="Number of Cases Since 10 Cases")
ax.legend()
ax.grid(True)

                      

In [24]:

                        %load_ext watermark
%watermark -n -u -v -iv -w

Last updated: Sun Aug 22 2021

Python implementation: CPython
Python version       : 3.9.2
IPython version      : 7.26.0

seaborn   : 0.11.2
numpy     : 1.21.2
sys       : 3.9.2 (default, Mar 12 2021, 19:04:51) 
[GCC 8.3.0]
pandas    : 1.3.2
pymc3     : 3.11.2
arviz     : 0.11.2
matplotlib: 3.4.3

Watermark: 2.2.0

In [ ]:

Humberto STEIN SHIROMOTO

Forecasting Seven Days Average Number of COVID19 Cases

Import¶

Modules¶

Scripts¶

Data¶

Audit¶

Split Data¶

Model Training¶

Train Model¶

Model Evaluation¶

Model Inference¶

Predict on Whole Data Set¶

Seven Days Forecast¶

Comments

You May Also Enjoy

Leveraging Generative AI for Effective Risk Management Part II: Actionable Strategies for Businesses

Leveraging Generative AI for Effective Risk Management Part I: Introduction to Business Risk

The Traveling Salesman Problem

Are Model Performance Metrics Enough?