SARIMAX: Model selection, missing data¶
The example mirrors Durbin and Koopman (2012), Chapter 8.4 in application of Box-Jenkins methodology to fit ARMA models. The novel feature is the ability of the model to work on datasets with missing values.
[1]:
%matplotlib inline
[2]:
import numpy as np
import pandas as pd
from scipy.stats import norm
import statsmodels.api as sm
import matplotlib.pyplot as plt
[3]:
import requests
from io import BytesIO
from zipfile import ZipFile

# Download the Durbin & Koopman example data archive and read the
# 'internet.dat' series into a DataFrame with columns:
#   internet  - the raw internet-usage series
#   dinternet - its first difference
# timeout=30 so a dead/unreachable host fails fast instead of hanging forever.
dk = requests.get('http://www.ssfpack.com/files/DK-data.zip', timeout=30).content
f = BytesIO(dk)
zipped = ZipFile(f)
df = pd.read_table(
    BytesIO(zipped.read('internet.dat')),
    # r'\s+' (raw string): '\s' in a plain string is an invalid escape
    # and raises a warning on modern Python.
    skiprows=1, header=None, sep=r'\s+', engine='python',
    names=['internet','dinternet']
)
---------------------------------------------------------------------------
ConnectionRefusedError Traceback (most recent call last)
/usr/lib/python3/dist-packages/urllib3/connection.py in _new_conn(self)
157 try:
--> 158 conn = connection.create_connection(
159 (self._dns_host, self.port), self.timeout, **extra_kw)
/usr/lib/python3/dist-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
79 if err is not None:
---> 80 raise err
81
/usr/lib/python3/dist-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
69 sock.bind(source_address)
---> 70 sock.connect(sa)
71 return sock
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
/usr/lib/python3/dist-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
596 # Make the request on the httplib connection object.
--> 597 httplib_response = self._make_request(conn, method, url,
598 timeout=timeout_obj,
/usr/lib/python3/dist-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
353 else:
--> 354 conn.request(method, url, **httplib_request_kw)
355
/usr/lib/python3.8/http/client.py in request(self, method, url, body, headers, encode_chunked)
1229 """Send a complete request to the server."""
-> 1230 self._send_request(method, url, body, headers, encode_chunked)
1231
/usr/lib/python3.8/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
1275 body = _encode(body, 'body')
-> 1276 self.endheaders(body, encode_chunked=encode_chunked)
1277
/usr/lib/python3.8/http/client.py in endheaders(self, message_body, encode_chunked)
1224 raise CannotSendHeader()
-> 1225 self._send_output(message_body, encode_chunked=encode_chunked)
1226
/usr/lib/python3.8/http/client.py in _send_output(self, message_body, encode_chunked)
1003 del self._buffer[:]
-> 1004 self.send(msg)
1005
/usr/lib/python3.8/http/client.py in send(self, data)
943 if self.auto_open:
--> 944 self.connect()
945 else:
/usr/lib/python3/dist-packages/urllib3/connection.py in connect(self)
180 def connect(self):
--> 181 conn = self._new_conn()
182 self._prepare_conn(conn)
/usr/lib/python3/dist-packages/urllib3/connection.py in _new_conn(self)
166 except SocketError as e:
--> 167 raise NewConnectionError(
168 self, "Failed to establish a new connection: %s" % e)
NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7efca32c90a0>: Failed to establish a new connection: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
/usr/lib/python3/dist-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
438 if not chunked:
--> 439 resp = conn.urlopen(
440 method=request.method,
/usr/lib/python3/dist-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
636
--> 637 retries = retries.increment(method, url, error=e, _pool=self,
638 _stacktrace=sys.exc_info()[2])
/usr/lib/python3/dist-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
397 if new_retry.is_exhausted():
--> 398 raise MaxRetryError(_pool, url, error or ResponseError(cause))
399
MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=9): Max retries exceeded with url: http://www.ssfpack.com/files/DK-data.zip (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7efca32c90a0>: Failed to establish a new connection: [Errno 111] Connection refused')))
During handling of the above exception, another exception occurred:
ProxyError Traceback (most recent call last)
<ipython-input-3-074aec8a1161> in <module>
4
5 # Download the dataset
----> 6 dk = requests.get('http://www.ssfpack.com/files/DK-data.zip').content
7 f = BytesIO(dk)
8 zipped = ZipFile(f)
/usr/lib/python3/dist-packages/requests/api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
/usr/lib/python3/dist-packages/requests/api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
/usr/lib/python3/dist-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
/usr/lib/python3/dist-packages/requests/sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
/usr/lib/python3/dist-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
508
509 if isinstance(e.reason, _ProxyError):
--> 510 raise ProxyError(e, request=request)
511
512 if isinstance(e.reason, _SSLError):
ProxyError: HTTPConnectionPool(host='127.0.0.1', port=9): Max retries exceeded with url: http://www.ssfpack.com/files/DK-data.zip (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7efca32c90a0>: Failed to establish a new connection: [Errno 111] Connection refused')))
Model Selection¶
As in Durbin and Koopman, we force a number of the values to be missing.
[4]:
# Extract the differenced internet series, dropping the first observation.
dta_full = df.dinternet[1:].values

# Make a copy and delete selected observations (1-based positions from
# Durbin & Koopman, hence the -1) to create the missing-data variant.
dta_miss = dta_full.copy()
missing = np.array([6, 16, 26, 36, 46, 56, 66, 72, 73, 74, 75, 76, 86, 96]) - 1
dta_miss[missing] = np.nan
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-4-70c0b0b5593e> in <module>
1 # Get the basic series
----> 2 dta_full = df.dinternet[1:].values
3 dta_miss = dta_full.copy()
4
5 # Remove datapoints
NameError: name 'df' is not defined
Then we can consider model selection using the Akaike information criterion (AIC), by running the model for each variant and selecting the model with the lowest AIC value.
There are a couple of things to note here:
- When running such a large batch of models, particularly when the autoregressive and moving average orders become large, there is the possibility of poor maximum likelihood convergence. Below we ignore the warnings since this example is illustrative.
- We use the option
enforce_invertibility=False
, which allows the moving average polynomial to be non-invertible, so that more of the models are estimable.
- Several of the models do not produce good results, and their AIC value is set to NaN. This is not surprising, as Durbin and Koopman note numerical problems with the high order models.
[5]:
import warnings

# AIC tables indexed by (p, q) for ARMA(p, q) specifications, p, q = 0..5.
aic_full = pd.DataFrame(np.zeros((6,6), dtype=float))
aic_miss = pd.DataFrame(np.zeros((6,6), dtype=float))

# High-order models often fail to converge cleanly; suppress the resulting
# warnings since this example is purely illustrative.
warnings.simplefilter('ignore')

# Iterate over all ARMA(p,q) models with p,q in 0..5 (ARMA(0,0) is skipped)
for p in range(6):
    for q in range(6):
        if p == 0 and q == 0:
            continue

        # Estimate the model with no missing datapoints
        mod = sm.tsa.statespace.SARIMAX(dta_full, order=(p,0,q), enforce_invertibility=False)
        try:
            res = mod.fit(disp=False)
            aic_full.iloc[p,q] = res.aic
        # except Exception (not a bare except): numerical failures are
        # expected for some high-order models, but we must not swallow
        # KeyboardInterrupt / SystemExit.
        except Exception:
            aic_full.iloc[p,q] = np.nan

        # Estimate the model with missing datapoints
        mod = sm.tsa.statespace.SARIMAX(dta_miss, order=(p,0,q), enforce_invertibility=False)
        try:
            res = mod.fit(disp=False)
            aic_miss.iloc[p,q] = res.aic
        except Exception:
            aic_miss.iloc[p,q] = np.nan
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-5-bcbba42f81f2> in <module>
13
14 # Estimate the model with no missing datapoints
---> 15 mod = sm.tsa.statespace.SARIMAX(dta_full, order=(p,0,q), enforce_invertibility=False)
16 try:
17 res = mod.fit(disp=False)
NameError: name 'dta_full' is not defined
For the models estimated over the full (non-missing) dataset, the AIC chooses ARMA(1,1) or ARMA(3,0). Durbin and Koopman suggest the ARMA(1,1) specification is better due to parsimony.
For the models estimated over the dataset with missing values, the AIC also chooses ARMA(1,1).
Note: the AIC values are calculated differently than in Durbin and Koopman, but show overall similar trends.
Postestimation¶
Using the ARMA(1,1) specification selected above, we perform in-sample prediction and out-of-sample forecasting.
[6]:
# Statespace
mod = sm.tsa.statespace.SARIMAX(dta_miss, order=(1,0,1))
res = mod.fit(disp=False)
print(res.summary())
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-6-8dd911607628> in <module>
1 # Statespace
----> 2 mod = sm.tsa.statespace.SARIMAX(dta_miss, order=(1,0,1))
3 res = mod.fit(disp=False)
4 print(res.summary())
NameError: name 'dta_miss' is not defined
[7]:
# In-sample one-step-ahead predictions, and out-of-sample forecasts
nforecast = 20
predict = res.get_prediction(end=mod.nobs + nforecast)
idx = np.arange(len(predict.predicted_mean))
# alpha=0.5 gives a 50% confidence interval (statsmodels convention:
# alpha is the significance level)
predict_ci = predict.conf_int(alpha=0.5)

# Graph: observed points, in-sample predictions (gray),
# out-of-sample forecasts (dashed black), and the confidence band
fig, ax = plt.subplots(figsize=(12,6))
ax.xaxis.grid()
ax.plot(dta_miss, 'k.')

# Plot — 'k--' already encodes the dashed style, so no separate
# linestyle keyword is needed (previously it was passed redundantly)
ax.plot(idx[:-nforecast], predict.predicted_mean[:-nforecast], 'gray')
ax.plot(idx[-nforecast:], predict.predicted_mean[-nforecast:], 'k--', linewidth=2)
ax.fill_between(idx, predict_ci[:, 0], predict_ci[:, 1], alpha=0.15)

ax.set(title='Figure 8.9 - Internet series');
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-7-394a7033843a> in <module>
1 # In-sample one-step-ahead predictions, and out-of-sample forecasts
2 nforecast = 20
----> 3 predict = res.get_prediction(end=mod.nobs + nforecast)
4 idx = np.arange(len(predict.predicted_mean))
5 predict_ci = predict.conf_int(alpha=0.5)
NameError: name 'res' is not defined