Skip to content

Electricity modelling with a BAM

This example includes fitting a big data GAM (BAM) to UK electricity demand data, taken from the national grid. The dataset contains:

  • NetDemand – net electricity demand
  • wM – instantaneous temperature, averaged over several cities
  • wM_s95 – exponential smooth of wM
  • Posan – periodic index in \([0, 1]\) indicating the position along the year
  • Dow – factor variable indicating the day of the week
  • Trend – progressive counter, useful for defining the long-term trend
  • Instant - The hour \([0, 23]\)
  • NetDemand.24 – 24 hour lagged version of NetDemand
  • Year - the year
import matplotlib.pyplot as plt
import pandas as pd

import pymgcv.plot as gplt
from pymgcv.terms import L, S, T
from pymgcv.utils import load_rdata_dataframe_from_url

data = load_rdata_dataframe_from_url(
    "https://github.com/mfasiolo/testGam/raw/master/data/gefcom_big.rda",
)
data.columns = [c.lower().replace(".", "_") for c in data.columns]
data["dow"] = data["dow"].astype("category")
pd.plotting.scatter_matrix(data.sample(n=250), s=20, figsize=(9,9))
plt.tight_layout()

img

from pymgcv.basis_functions import CubicSpline
from pymgcv.gam import BAM

gam1 = BAM(
    {
    "netdemand":  (
        L("dow") +   # Encoded as category
        S("netdemand_24", bs=CubicSpline()) +
        S("trend", k=6, bs=CubicSpline()) +
        S("wm", bs=CubicSpline()) +
        S("instant", bs=CubicSpline()) +
        S("wm_s95", bs=CubicSpline()) +
        S("posan", bs=CubicSpline(cyclic=True), k=20)
    ), 
    }, 
 )


gam1.fit(data, discrete=True, n_threads=1) 
gplt.plot(gam1)
plt.show()

img

Check whether the bases are large enough

gam1.check_k()
term  max_edf        edf   k_index  p_value
0  s(netdemand_24)      9.0   8.371362  0.977012   0.0375
1         s(trend)      5.0   4.938627  0.728228   0.0000
2            s(wm)      9.0   8.961725  1.021851   0.9450
3       s(instant)      9.0   8.989825  0.990962   0.2750
4        s(wm_s95)      9.0   8.867216  1.015348   0.8500
5         s(posan)     18.0  17.049568  0.830697   0.0000

The p-values for s(trend) and s(posan) are very low:

  • Raising k for s(posan) may help.
  • Raising k for s(trend) may help, but for time components like this, increasing k to much can lead to overfitting. It might be better to try improving the model in other ways (e.g. autoregressive components).
  • The smooths of instant, wm_s95 and Instant have EDF values close to the maximum, so it might help to increase these too.
gam2 = BAM({
    "netdemand":  (
        L("dow") +   # Encoded as category
        S("netdemand_24", k=20, bs=CubicSpline()) +
        S("trend", k=6, bs=CubicSpline()) +
        S("wm", k=20, bs=CubicSpline()) +
        S("instant", k=20, bs=CubicSpline()) +
        S("wm_s95", k=20, bs=CubicSpline()) +
        S("posan", k=30, bs=CubicSpline(cyclic=True))
    ), 
    })
gam2.fit(data, discrete=True)
gam2.check_k()
term  max_edf        edf   k_index  p_value
0  s(netdemand_24)     19.0  11.775060  1.007657   0.6925
1         s(trend)      5.0   4.939703  0.720882   0.0000
2            s(wm)     19.0  17.350784  0.986407   0.1475
3       s(instant)     19.0  18.420370  1.005573   0.6575
4        s(wm_s95)     19.0  15.928629  0.991724   0.3175
5         s(posan)     28.0  25.160270  0.847709   0.0000
  • The EDF values are now all somewhat less than the maximum (except from trend, but this is OK).
  • We can check if interaction terms might be useful, for example with instant (the temperature), by looking at the residuals.
residuals = gam2.residuals()
plot_vars = ["netdemand_24", "wm", "wm_s95", "posan"]
fig, axes = plt.subplots(nrows=2, ncols=2, sharey=True, layout="constrained")

for var, ax in zip(plot_vars, axes.flatten(), strict=False):
    gplt.hexbin_residuals(
        residuals,
        var,
        "instant",
        data=data,
        ax=ax,
        )
fig.colorbar(axes[0,0].collections[0], ax=axes);

img

It's clear that we should include interaction terms! Because the variables are on different scales, we should use T. We can include them as interaction only terms, and keep the original effects.

interaction_terms = (
    T("netdemand_24", "instant", k=(15, 15), interaction_only=True) +
    T("wm", "instant", k=(10, 10), interaction_only=True) +
    T("wm_s95", "instant", k=(15, 15), interaction_only=True) +
    T("posan", "instant", k=(10, 15), interaction_only=True)
)
# Take previous models predictors and extend the model
predictors = gam2.predictors.copy()
predictors["netdemand"].extend(interaction_terms)
gam3 = BAM(predictors)
gam3.fit(data, discrete=True)
gam3.check_k()
term  max_edf         edf   k_index  p_value
0           s(netdemand_24)     19.0   13.360622  0.971776   0.0250
1                  s(trend)      5.0    4.975395  0.650465   0.0000
2                     s(wm)     19.0   17.582837  1.011076   0.7675
3                s(instant)     19.0   18.547421  1.035581   0.9950
4                 s(wm_s95)     19.0   17.492170  0.974010   0.0375
5                  s(posan)     28.0   26.707128  0.906030   0.0000
6  ti(netdemand_24,instant)    196.0  103.933439  1.019063   0.9350
7            ti(wm,instant)     81.0   54.844276  1.000472   0.4950
8        ti(wm_s95,instant)    196.0  110.628848  1.007131   0.6500
9         ti(posan,instant)    126.0  113.587174  0.937806   0.0000
residuals = gam3.residuals()
fig, axes = plt.subplots(nrows=2, ncols=2, sharey=True, layout="constrained")

for var, ax in zip(plot_vars, axes.flatten(), strict=False):
    gplt.hexbin_residuals(residuals, var, "instant", data=data, ax=ax)

fig.colorbar(axes[0, 0].collections[0], ax=axes, orientation="vertical", label="Residuals");

img

Much better! If we check the AIC, it also suggests a better fit

for name, m in {"Univariate smooths": gam2, "Interaction": gam3}.items():
    print(f"{name}: {m.aic()}")
Univariate smooths: 460416.5326344241
Interaction: 431496.1108823907
# The smooth and categorical term:
fig, axes = gplt.plot(gam3, to_plot=S | L, ncols=4)

img

  • We might be concerned with posan - it's quite wiggly! Maybe it's an issue with holidays.
# The interaction terms - we'll just show a subset of the datapoints
fig, axes = gplt.plot(
    gam3,
    to_plot=T,
    scatter=True,
    data=data.sample(n=400),
    )

img