SARIMAX: Model selection, missing data

The example mirrors Durbin and Koopman (2012), Chapter 8.4 in application of Box-Jenkins methodology to fit ARMA models. The novel feature is the ability of the model to work on datasets with missing values.

In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
from scipy.stats import norm
import statsmodels.api as sm
import matplotlib.pyplot as plt
/build/statsmodels-JytjB9/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
In [3]:
import requests
from io import BytesIO
from zipfile import ZipFile

# Download the dataset
dk = requests.get('http://www.ssfpack.com/files/DK-data.zip').content
f = BytesIO(dk)
zipped = ZipFile(f)
df = pd.read_table(
    BytesIO(zipped.read('internet.dat')),
    skiprows=1, header=None, sep='\s+', engine='python',
    names=['internet','dinternet']
)
---------------------------------------------------------------------------
ConnectionRefusedError                    Traceback (most recent call last)
/usr/lib/python3/dist-packages/urllib3/connection.py in _new_conn(self)
    140             conn = connection.create_connection(
--> 141                 (self.host, self.port), self.timeout, **extra_kw)
    142 

/usr/lib/python3/dist-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
     82     if err is not None:
---> 83         raise err
     84 

/usr/lib/python3/dist-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
     72                 sock.bind(source_address)
---> 73             sock.connect(sa)
     74             return sock

ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

NewConnectionError                        Traceback (most recent call last)
/usr/lib/python3/dist-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    600                                                   body=body, headers=headers,
--> 601                                                   chunked=chunked)
    602 

/usr/lib/python3/dist-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    356         else:
--> 357             conn.request(method, url, **httplib_request_kw)
    358 

/usr/lib/python3.6/http/client.py in request(self, method, url, body, headers, encode_chunked)
   1238         """Send a complete request to the server."""
-> 1239         self._send_request(method, url, body, headers, encode_chunked)
   1240 

/usr/lib/python3.6/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1284             body = _encode(body, 'body')
-> 1285         self.endheaders(body, encode_chunked=encode_chunked)
   1286 

/usr/lib/python3.6/http/client.py in endheaders(self, message_body, encode_chunked)
   1233             raise CannotSendHeader()
-> 1234         self._send_output(message_body, encode_chunked=encode_chunked)
   1235 

/usr/lib/python3.6/http/client.py in _send_output(self, message_body, encode_chunked)
   1025         del self._buffer[:]
-> 1026         self.send(msg)
   1027 

/usr/lib/python3.6/http/client.py in send(self, data)
    963             if self.auto_open:
--> 964                 self.connect()
    965             else:

/usr/lib/python3/dist-packages/urllib3/connection.py in connect(self)
    165     def connect(self):
--> 166         conn = self._new_conn()
    167         self._prepare_conn(conn)

/usr/lib/python3/dist-packages/urllib3/connection.py in _new_conn(self)
    149             raise NewConnectionError(
--> 150                 self, "Failed to establish a new connection: %s" % e)
    151 

NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7f2f1c0ec2e8>: Failed to establish a new connection: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

MaxRetryError                             Traceback (most recent call last)
/usr/lib/python3/dist-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    439                     retries=self.max_retries,
--> 440                     timeout=timeout
    441                 )

/usr/lib/python3/dist-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    638             retries = retries.increment(method, url, error=e, _pool=self,
--> 639                                         _stacktrace=sys.exc_info()[2])
    640             retries.sleep()

/usr/lib/python3/dist-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    387         if new_retry.is_exhausted():
--> 388             raise MaxRetryError(_pool, url, error or ResponseError(cause))
    389 

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=9): Max retries exceeded with url: http://www.ssfpack.com/files/DK-data.zip (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f2f1c0ec2e8>: Failed to establish a new connection: [Errno 111] Connection refused',)))

During handling of the above exception, another exception occurred:

ProxyError                                Traceback (most recent call last)
<ipython-input-3-074aec8a1161> in <module>()
      4 
      5 # Download the dataset
----> 6 dk = requests.get('http://www.ssfpack.com/files/DK-data.zip').content
      7 f = BytesIO(dk)
      8 zipped = ZipFile(f)

/usr/lib/python3/dist-packages/requests/api.py in get(url, params, **kwargs)
     70 
     71     kwargs.setdefault('allow_redirects', True)
---> 72     return request('get', url, params=params, **kwargs)
     73 
     74 

/usr/lib/python3/dist-packages/requests/api.py in request(method, url, **kwargs)
     56     # cases, and look like a memory leak in others.
     57     with sessions.Session() as session:
---> 58         return session.request(method=method, url=url, **kwargs)
     59 
     60 

/usr/lib/python3/dist-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    506         }
    507         send_kwargs.update(settings)
--> 508         resp = self.send(prep, **send_kwargs)
    509 
    510         return resp

/usr/lib/python3/dist-packages/requests/sessions.py in send(self, request, **kwargs)
    616 
    617         # Send the request
--> 618         r = adapter.send(request, **kwargs)
    619 
    620         # Total elapsed time of the request (approximately)

/usr/lib/python3/dist-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    500 
    501             if isinstance(e.reason, _ProxyError):
--> 502                 raise ProxyError(e, request=request)
    503 
    504             if isinstance(e.reason, _SSLError):

ProxyError: HTTPConnectionPool(host='127.0.0.1', port=9): Max retries exceeded with url: http://www.ssfpack.com/files/DK-data.zip (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f2f1c0ec2e8>: Failed to establish a new connection: [Errno 111] Connection refused',)))

Model Selection

As in Durbin and Koopman, we force a number of the values to be missing.

In [4]:
# Get the basic series
dta_full = df.dinternet[1:].values
dta_miss = dta_full.copy()

# Remove datapoints
missing = np.r_[6,16,26,36,46,56,66,72,73,74,75,76,86,96]-1
dta_miss[missing] = np.nan
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-70c0b0b5593e> in <module>()
      1 # Get the basic series
----> 2 dta_full = df.dinternet[1:].values
      3 dta_miss = dta_full.copy()
      4 
      5 # Remove datapoints

NameError: name 'df' is not defined

Then we can consider model selection using the Akaike information criteria (AIC), but running the model for each variant and selecting the model with the lowest AIC value.

There are a couple of things to note here:

  • When running such a large batch of models, particularly when the autoregressive and moving average orders become large, there is the possibility of poor maximum likelihood convergence. Below we ignore the warnings since this example is illustrative.
  • We use the option enforce_invertibility=False, which allows the moving average polynomial to be non-invertible, so that more of the models are estimable.
  • Several of the models do not produce good results, and their AIC value is set to NaN. This is not surprising, as Durbin and Koopman note numerical problems with the high order models.
In [5]:
import warnings

aic_full = pd.DataFrame(np.zeros((6,6), dtype=float))
aic_miss = pd.DataFrame(np.zeros((6,6), dtype=float))

warnings.simplefilter('ignore')

# Iterate over all ARMA(p,q) models with p,q in [0,6]
for p in range(6):
    for q in range(6):
        if p == 0 and q == 0:
            continue
            
        # Estimate the model with no missing datapoints
        mod = sm.tsa.statespace.SARIMAX(dta_full, order=(p,0,q), enforce_invertibility=False)
        try:
            res = mod.fit()
            aic_full.iloc[p,q] = res.aic
        except:
            aic_full.iloc[p,q] = np.nan
        
        # Estimate the model with missing datapoints
        mod = sm.tsa.statespace.SARIMAX(dta_miss, order=(p,0,q), enforce_invertibility=False)
        try:
            res = mod.fit()
            aic_miss.iloc[p,q] = res.aic
        except:
            aic_miss.iloc[p,q] = np.nan
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-735b63dc22a3> in <module>()
     13 
     14         # Estimate the model with no missing datapoints
---> 15         mod = sm.tsa.statespace.SARIMAX(dta_full, order=(p,0,q), enforce_invertibility=False)
     16         try:
     17             res = mod.fit()

NameError: name 'dta_full' is not defined

For the models estimated over the full (non-missing) dataset, the AIC chooses ARMA(1,1) or ARMA(3,0). Durbin and Koopman suggest the ARMA(1,1) specification is better due to parsimony.

$$ \text{Replication of:}\\ \textbf{Table 8.1} ~~ \text{AIC for different ARMA models.}\\ \newcommand{\r}[1]{{\color{red}{#1}}} \begin{array}{lrrrrrr} \hline q & 0 & 1 & 2 & 3 & 4 & 5 \\ \hline p & {} & {} & {} & {} & {} & {} \\ 0 & 0.00 & 549.81 & 519.87 & 520.27 & 519.38 & 518.86 \\ 1 & 529.24 & \r{514.30} & 516.25 & 514.58 & 515.10 & 516.28 \\ 2 & 522.18 & 516.29 & 517.16 & 515.77 & 513.24 & 514.73 \\ 3 & \r{511.99} & 513.94 & 515.92 & 512.06 & 513.72 & 514.50 \\ 4 & 513.93 & 512.89 & nan & nan & 514.81 & 516.08 \\ 5 & 515.86 & 517.64 & nan & nan & nan & nan \\ \hline \end{array} $$

For the models estimated over missing dataset, the AIC chooses ARMA(1,1)

$$ \text{Replication of:}\\ \textbf{Table 8.2} ~~ \text{AIC for different ARMA models with missing observations.}\\ \begin{array}{lrrrrrr} \hline q & 0 & 1 & 2 & 3 & 4 & 5 \\ \hline p & {} & {} & {} & {} & {} & {} \\ 0 & 0.00 & 488.93 & 464.01 & 463.86 & 462.63 & 463.62 \\ 1 & 468.01 & \r{457.54} & 459.35 & 458.66 & 459.15 & 461.01 \\ 2 & 469.68 & nan & 460.48 & 459.43 & 459.23 & 460.47 \\ 3 & 467.10 & 458.44 & 459.64 & 456.66 & 459.54 & 460.05 \\ 4 & 469.00 & 459.52 & nan & 463.04 & 459.35 & 460.96 \\ 5 & 471.32 & 461.26 & nan & nan & 461.00 & 462.97 \\ \hline \end{array} $$

Note: the AIC values are calculated differently than in Durbin and Koopman, but show overall similar trends.

Postestimation

Using the ARMA(1,1) specification selected above, we perform in-sample prediction and out-of-sample forecasting.

In [6]:
# Statespace
mod = sm.tsa.statespace.SARIMAX(dta_miss, order=(1,0,1))
res = mod.fit()
print(res.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-dd0a0a728f6a> in <module>()
      1 # Statespace
----> 2 mod = sm.tsa.statespace.SARIMAX(dta_miss, order=(1,0,1))
      3 res = mod.fit()
      4 print(res.summary())

NameError: name 'dta_miss' is not defined
In [7]:
# In-sample one-step-ahead predictions, and out-of-sample forecasts
nforecast = 20
predict = res.get_prediction(end=mod.nobs + nforecast)
idx = np.arange(len(predict.predicted_mean))
predict_ci = predict.conf_int(alpha=0.5)

# Graph
fig, ax = plt.subplots(figsize=(12,6))
ax.xaxis.grid()
ax.plot(dta_miss, 'k.')

# Plot
ax.plot(idx[:-nforecast], predict.predicted_mean[:-nforecast], 'gray')
ax.plot(idx[-nforecast:], predict.predicted_mean[-nforecast:], 'k--', linestyle='--', linewidth=2)
ax.fill_between(idx, predict_ci.iloc[:, 0], predict_ci.iloc[:, 1], alpha=0.15)

ax.set(title='Figure 8.9 - Internet series');
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-c5a0278f6f27> in <module>()
      1 # In-sample one-step-ahead predictions, and out-of-sample forecasts
      2 nforecast = 20
----> 3 predict = res.get_prediction(end=mod.nobs + nforecast)
      4 idx = np.arange(len(predict.predicted_mean))
      5 predict_ci = predict.conf_int(alpha=0.5)

NameError: name 'res' is not defined