Regression Plots

In [1]:
%matplotlib inline

from __future__ import print_function
from statsmodels.compat import lzip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
/build/statsmodels-JytjB9/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

Duncan's Prestige Dataset

Load the Data

We can use a utility function to load any R dataset available from the great Rdatasets package.

In [2]:
prestige = sm.datasets.get_rdataset("Duncan", "car", cache=True).data
---------------------------------------------------------------------------
gaierror                                  Traceback (most recent call last)
/usr/lib/python3.6/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1317                 h.request(req.get_method(), req.selector, req.data, headers,
-> 1318                           encode_chunked=req.has_header('Transfer-encoding'))
   1319             except OSError as err: # timeout error

/usr/lib/python3.6/http/client.py in request(self, method, url, body, headers, encode_chunked)
   1238         """Send a complete request to the server."""
-> 1239         self._send_request(method, url, body, headers, encode_chunked)
   1240 

/usr/lib/python3.6/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1284             body = _encode(body, 'body')
-> 1285         self.endheaders(body, encode_chunked=encode_chunked)
   1286 

/usr/lib/python3.6/http/client.py in endheaders(self, message_body, encode_chunked)
   1233             raise CannotSendHeader()
-> 1234         self._send_output(message_body, encode_chunked=encode_chunked)
   1235 

/usr/lib/python3.6/http/client.py in _send_output(self, message_body, encode_chunked)
   1025         del self._buffer[:]
-> 1026         self.send(msg)
   1027 

/usr/lib/python3.6/http/client.py in send(self, data)
    963             if self.auto_open:
--> 964                 self.connect()
    965             else:

/usr/lib/python3.6/http/client.py in connect(self)
   1391 
-> 1392             super().connect()
   1393 

/usr/lib/python3.6/http/client.py in connect(self)
    935         self.sock = self._create_connection(
--> 936             (self.host,self.port), self.timeout, self.source_address)
    937         self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

/usr/lib/python3.6/socket.py in create_connection(address, timeout, source_address)
    703     err = None
--> 704     for res in getaddrinfo(host, port, 0, SOCK_STREAM):
    705         af, socktype, proto, canonname, sa = res

/usr/lib/python3.6/socket.py in getaddrinfo(host, port, family, type, proto, flags)
    744     addrlist = []
--> 745     for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
    746         af, socktype, proto, canonname, sa = res

gaierror: [Errno -2] Name or service not known

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-2-b71db8dae743> in <module>()
----> 1 prestige = sm.datasets.get_rdataset("Duncan", "car", cache=True).data

/build/statsmodels-JytjB9/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/datasets/utils.py in get_rdataset(dataname, package, cache)
    288                      "master/doc/"+package+"/rst/")
    289     cache = _get_cache(cache)
--> 290     data, from_cache = _get_data(data_base_url, dataname, cache)
    291     data = read_csv(data, index_col=0)
    292     data = _maybe_reset_index(data)

/build/statsmodels-JytjB9/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/datasets/utils.py in _get_data(base_url, dataname, cache, extension)
    219     url = base_url + (dataname + ".%s") % extension
    220     try:
--> 221         data, from_cache = _urlopen_cached(url, cache)
    222     except HTTPError as err:
    223         if '404' in str(err):

/build/statsmodels-JytjB9/statsmodels-0.8.0/.pybuild/cpython3_3.6_statsmodels/build/statsmodels/datasets/utils.py in _urlopen_cached(url, cache)
    210     # not using the cache or didn't find it in cache
    211     if not from_cache:
--> 212         data = urlopen(url).read()
    213         if cache is not None:  # then put it in the cache
    214             _cache_it(data, cache_path)

/usr/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):

/usr/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    524             req = meth(req)
    525 
--> 526         response = self._open(req, data)
    527 
    528         # post-process response

/usr/lib/python3.6/urllib/request.py in _open(self, req, data)
    542         protocol = req.type
    543         result = self._call_chain(self.handle_open, protocol, protocol +
--> 544                                   '_open', req)
    545         if result:
    546             return result

/usr/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result

/usr/lib/python3.6/urllib/request.py in https_open(self, req)
   1359         def https_open(self, req):
   1360             return self.do_open(http.client.HTTPSConnection, req,
-> 1361                 context=self._context, check_hostname=self._check_hostname)
   1362 
   1363         https_request = AbstractHTTPHandler.do_request_

/usr/lib/python3.6/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1318                           encode_chunked=req.has_header('Transfer-encoding'))
   1319             except OSError as err: # timeout error
-> 1320                 raise URLError(err)
   1321             r = h.getresponse()
   1322         except:

URLError: <urlopen error [Errno -2] Name or service not known>
In [3]:
prestige.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-087c62f057d1> in <module>()
----> 1 prestige.head()

NameError: name 'prestige' is not defined
In [4]:
prestige_model = ols("prestige ~ income + education", data=prestige).fit()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-dc387f7391aa> in <module>()
----> 1 prestige_model = ols("prestige ~ income + education", data=prestige).fit()

NameError: name 'prestige' is not defined
In [5]:
print(prestige_model.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-3f9f56dc18db> in <module>()
----> 1 print(prestige_model.summary())

NameError: name 'prestige_model' is not defined

Influence plots

Influence plots show the (externally) studentized residuals vs. the leverage of each observation as measured by the hat matrix.

Externally studentized residuals are residuals that are scaled by their standard deviation where

$$var(\hat{\epsilon}_i)=\hat{\sigma}^2_i(1-h_{ii})$$

with

$$\hat{\sigma}^2_i=\frac{1}{n - p - 1 \;\;}\sum_{j}^{n}\;\;\;\forall \;\;\; j \neq i$$

$n$ is the number of observations and $p$ is the number of regressors. $h_{ii}$ is the $i$-th diagonal element of the hat matrix

$$H=X(X^{\;\prime}X)^{-1}X^{\;\prime}$$

The influence of each point can be visualized by the criterion keyword argument. Options are Cook's distance and DFFITS, two measures of influence.

In [6]:
fig, ax = plt.subplots(figsize=(12,8))
fig = sm.graphics.influence_plot(prestige_model, ax=ax, criterion="cooks")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-f66273ab6ed1> in <module>()
      1 fig, ax = plt.subplots(figsize=(12,8))
----> 2 fig = sm.graphics.influence_plot(prestige_model, ax=ax, criterion="cooks")

NameError: name 'prestige_model' is not defined

As you can see there are a few worrisome observations. Both contractor and reporter have low leverage but a large residual.
RR.engineer has small residual and large leverage. Conductor and minister have both high leverage and large residuals, and,
therefore, large influence.

Partial Regression Plots

Since we are doing multivariate regressions, we cannot just look at individual bivariate plots to discern relationships.
Instead, we want to look at the relationship of the dependent variable and independent variables conditional on the other
independent variables. We can do this through using partial regression plots, otherwise known as added variable plots.

In a partial regression plot, to discern the relationship between the response variable and the $k$-th variabe, we compute
the residuals by regressing the response variable versus the independent variables excluding $X_k$. We can denote this by
$X_{\sim k}$. We then compute the residuals by regressing $X_k$ on $X_{\sim k}$. The partial regression plot is the plot
of the former versus the latter residuals.

The notable points of this plot are that the fitted line has slope $\beta_k$ and intercept zero. The residuals of this plot
are the same as those of the least squares fit of the original model with full $X$. You can discern the effects of the
individual data values on the estimation of a coefficient easily. If obs_labels is True, then these points are annotated
with their observation label. You can also see the violation of underlying assumptions such as homooskedasticity and
linearity.

In [7]:
fig, ax = plt.subplots(figsize=(12,8))
fig = sm.graphics.plot_partregress("prestige", "income", ["income", "education"], data=prestige, ax=ax)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-b778372af5cd> in <module>()
      1 fig, ax = plt.subplots(figsize=(12,8))
----> 2 fig = sm.graphics.plot_partregress("prestige", "income", ["income", "education"], data=prestige, ax=ax)

NameError: name 'prestige' is not defined
In [8]:
fix, ax = plt.subplots(figsize=(12,14))
fig = sm.graphics.plot_partregress("prestige", "income", ["education"], data=prestige, ax=ax)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-8c1565e62244> in <module>()
      1 fix, ax = plt.subplots(figsize=(12,14))
----> 2 fig = sm.graphics.plot_partregress("prestige", "income", ["education"], data=prestige, ax=ax)

NameError: name 'prestige' is not defined

As you can see the partial regression plot confirms the influence of conductor, minister, and RR.engineer on the partial relationship between income and prestige. The cases greatly decrease the effect of income on prestige. Dropping these cases confirms this.

In [9]:
subset = ~prestige.index.isin(["conductor", "RR.engineer", "minister"])
prestige_model2 = ols("prestige ~ income + education", data=prestige, subset=subset).fit()
print(prestige_model2.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-c3065da947ac> in <module>()
----> 1 subset = ~prestige.index.isin(["conductor", "RR.engineer", "minister"])
      2 prestige_model2 = ols("prestige ~ income + education", data=prestige, subset=subset).fit()
      3 print(prestige_model2.summary())

NameError: name 'prestige' is not defined

For a quick check of all the regressors, you can use plot_partregress_grid. These plots will not label the
points, but you can use them to identify problems and then use plot_partregress to get more information.

In [10]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(prestige_model, fig=fig)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-2c5fe610dc0f> in <module>()
      1 fig = plt.figure(figsize=(12,8))
----> 2 fig = sm.graphics.plot_partregress_grid(prestige_model, fig=fig)

NameError: name 'prestige_model' is not defined
<Figure size 864x576 with 0 Axes>

Component-Component plus Residual (CCPR) Plots

The CCPR plot provides a way to judge the effect of one regressor on the
response variable by taking into account the effects of the other
independent variables. The partial residuals plot is defined as
$\text{Residuals} + B_iX_i \text{ }\text{ }$ versus $X_i$. The component adds $B_iX_i$ versus
$X_i$ to show where the fitted line would lie. Care should be taken if $X_i$
is highly correlated with any of the other independent variables. If this
is the case, the variance evident in the plot will be an underestimate of
the true variance.

In [11]:
fig, ax = plt.subplots(figsize=(12, 8))
fig = sm.graphics.plot_ccpr(prestige_model, "education", ax=ax)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-9bdf0dbc5b7a> in <module>()
      1 fig, ax = plt.subplots(figsize=(12, 8))
----> 2 fig = sm.graphics.plot_ccpr(prestige_model, "education", ax=ax)

NameError: name 'prestige_model' is not defined

As you can see the relationship between the variation in prestige explained by education conditional on income seems to be linear, though you can see there are some observations that are exerting considerable influence on the relationship. We can quickly look at more than one variable by using plot_ccpr_grid.

In [12]:
fig = plt.figure(figsize=(12, 8))
fig = sm.graphics.plot_ccpr_grid(prestige_model, fig=fig)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-5efa8010d4a4> in <module>()
      1 fig = plt.figure(figsize=(12, 8))
----> 2 fig = sm.graphics.plot_ccpr_grid(prestige_model, fig=fig)

NameError: name 'prestige_model' is not defined
<Figure size 864x576 with 0 Axes>

Regression Plots

The plot_regress_exog function is a convenience function that gives a 2x2 plot containing the dependent variable and fitted values with confidence intervals vs. the independent variable chosen, the residuals of the model vs. the chosen independent variable, a partial regression plot, and a CCPR plot. This function can be used for quickly checking modeling assumptions with respect to a single regressor.

In [13]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(prestige_model, "education", fig=fig)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-a057a0728815> in <module>()
      1 fig = plt.figure(figsize=(12,8))
----> 2 fig = sm.graphics.plot_regress_exog(prestige_model, "education", fig=fig)

NameError: name 'prestige_model' is not defined
<Figure size 864x576 with 0 Axes>

Fit Plot

The plot_fit function plots the fitted values versus a chosen independent variable. It includes prediction confidence intervals and optionally plots the true dependent variable.

In [14]:
fig, ax = plt.subplots(figsize=(12, 8))
fig = sm.graphics.plot_fit(prestige_model, "education", ax=ax)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-8d29314051b1> in <module>()
      1 fig, ax = plt.subplots(figsize=(12, 8))
----> 2 fig = sm.graphics.plot_fit(prestige_model, "education", ax=ax)

NameError: name 'prestige_model' is not defined

Statewide Crime 2009 Dataset

Compare the following to http://www.ats.ucla.edu/stat/stata/webbooks/reg/chapter4/statareg_self_assessment_answers4.htm

Though the data here is not the same as in that example. You could run that example by uncommenting the necessary cells below.

In [15]:
#dta = pd.read_csv("http://www.stat.ufl.edu/~aa/social/csv_files/statewide-crime-2.csv")
#dta = dta.set_index("State", inplace=True).dropna()
#dta.rename(columns={"VR" : "crime",
#                    "MR" : "murder",
#                    "M"  : "pctmetro",
#                    "W"  : "pctwhite",
#                    "H"  : "pcths",
#                    "P"  : "poverty",
#                    "S"  : "single"
#                    }, inplace=True)
#
#crime_model = ols("murder ~ pctmetro + poverty + pcths + single", data=dta).fit()
In [16]:
dta = sm.datasets.statecrime.load_pandas().data
/usr/lib/python3/dist-packages/numpy/lib/npyio.py:2230: VisibleDeprecationWarning: Reading unicode strings without specifying the encoding argument is deprecated. Set the encoding, use None for the system default.
  output = genfromtxt(fname, **kwargs)
In [17]:
crime_model = ols("murder ~ urban + poverty + hs_grad + single", data=dta).fit()
print(crime_model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 murder   R-squared:                       0.813
Model:                            OLS   Adj. R-squared:                  0.797
Method:                 Least Squares   F-statistic:                     50.08
Date:                Mon, 02 Jul 2018   Prob (F-statistic):           3.42e-16
Time:                        14:18:18   Log-Likelihood:                -95.050
No. Observations:                  51   AIC:                             200.1
Df Residuals:                      46   BIC:                             209.8
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -44.1024     12.086     -3.649      0.001     -68.430     -19.774
urban          0.0109      0.015      0.707      0.483      -0.020       0.042
poverty        0.4121      0.140      2.939      0.005       0.130       0.694
hs_grad        0.3059      0.117      2.611      0.012       0.070       0.542
single         0.6374      0.070      9.065      0.000       0.496       0.779
==============================================================================
Omnibus:                        1.618   Durbin-Watson:                   2.507
Prob(Omnibus):                  0.445   Jarque-Bera (JB):                0.831
Skew:                          -0.220   Prob(JB):                        0.660
Kurtosis:                       3.445   Cond. No.                     5.80e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.8e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

Partial Regression Plots

In [18]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(crime_model, fig=fig)
In [19]:
fig, ax = plt.subplots(figsize=(12,8))
fig = sm.graphics.plot_partregress("murder", "hs_grad", ["urban", "poverty", "single"],  ax=ax, data=dta)

Leverage-Resid2 Plot

Closely related to the influence_plot is the leverage-resid2 plot.

In [20]:
fig, ax = plt.subplots(figsize=(8,6))
fig = sm.graphics.plot_leverage_resid2(crime_model, ax=ax)

Influence Plot

In [21]:
fig, ax = plt.subplots(figsize=(8,6))
fig = sm.graphics.influence_plot(crime_model, ax=ax)

Using robust regression to correct for outliers.

Part of the problem here in recreating the Stata results is that M-estimators are not robust to leverage points. MM-estimators should do better with this examples.

In [22]:
from statsmodels.formula.api import rlm
In [23]:
rob_crime_model = rlm("murder ~ urban + poverty + hs_grad + single", data=dta, 
                      M=sm.robust.norms.TukeyBiweight(3)).fit(conv="weights")
print(rob_crime_model.summary())
                    Robust linear Model Regression Results                    
==============================================================================
Dep. Variable:                 murder   No. Observations:                   51
Model:                            RLM   Df Residuals:                       46
Method:                          IRLS   Df Model:                            4
Norm:                   TukeyBiweight                                         
Scale Est.:                       mad                                         
Cov Type:                          H1                                         
Date:                Mon, 02 Jul 2018                                         
Time:                        14:18:20                                         
No. Iterations:                    50                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -4.2986      9.494     -0.453      0.651     -22.907      14.310
urban          0.0029      0.012      0.241      0.809      -0.021       0.027
poverty        0.2753      0.110      2.499      0.012       0.059       0.491
hs_grad       -0.0302      0.092     -0.328      0.743      -0.211       0.150
single         0.2902      0.055      5.253      0.000       0.182       0.398
==============================================================================

If the model instance has been used for another fit with different fit
parameters, then the fit options might not be the correct ones anymore .
In [24]:
#rob_crime_model = rlm("murder ~ pctmetro + poverty + pcths + single", data=dta, M=sm.robust.norms.TukeyBiweight()).fit(conv="weights")
#print(rob_crime_model.summary())

There isn't yet an influence diagnostics method as part of RLM, but we can recreate them. (This depends on the status of issue #888)

In [25]:
weights = rob_crime_model.weights
idx = weights > 0
X = rob_crime_model.model.exog[idx.values]
ww = weights[idx] / weights[idx].mean()
hat_matrix_diag = ww*(X*np.linalg.pinv(X).T).sum(1)
resid = rob_crime_model.resid
resid2 = resid**2
resid2 /= resid2.sum()
nobs = int(idx.sum())
hm = hat_matrix_diag.mean()
rm = resid2.mean()
In [26]:
from statsmodels.graphics import utils
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(resid2[idx], hat_matrix_diag, 'o')
ax = utils.annotate_axes(range(nobs), labels=rob_crime_model.model.data.row_labels[idx], 
                    points=lzip(resid2[idx], hat_matrix_diag), offset_points=[(-5,5)]*nobs,
                    size="large", ax=ax)
ax.set_xlabel("resid2")
ax.set_ylabel("leverage")
ylim = ax.get_ylim()
ax.vlines(rm, *ylim)
xlim = ax.get_xlim()
ax.hlines(hm, *xlim)
ax.margins(0,0)