Multiclass sparse logistic regression on 20newsgroups
Comparison of multinomial logistic L1 vs one-versus-rest L1 logistic regression to classify documents from the 20newsgroups dataset. Multinomial logistic regression yields more accurate results and is faster to train on this larger-scale dataset.
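With scikit-learn's parameterisation (an l1 penalty term plus C times the data-fit term; intercepts omitted for brevity), the two settings can be restated as follows. The notation here is ours, not part of the original example. One-versus-rest solves one independent binary problem per class k,

\min_{w_k} \|w_k\|_1 + C \sum_{i=1}^{n} \log\bigl(1 + \exp(-t_{ik}\, w_k^\top x_i)\bigr), \qquad t_{ik} = +1 \text{ if } y_i = k, \text{ else } -1,

whereas multinomial logistic regression fits all class weight vectors jointly through the softmax likelihood,

\min_{W} \|W\|_1 + C \sum_{i=1}^{n} -\log \frac{\exp(w_{y_i}^\top x_i)}{\sum_k \exp(w_k^\top x_i)}.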
Here we use the l1 penalty, whose sparsity trims the weights of uninformative features to zero. This is good if the goal is to extract the strongly discriminative vocabulary of each class. If the goal is the best predictive accuracy, it is better to use the non-sparsity-inducing l2 penalty instead.
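As a minimal sketch of that vocabulary-extraction use (our illustration, not part of the example; the vectorized loader exposes target_names but not the vectorizer's vocabulary, so we report feature column indices rather than words):

import numpy as np
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.linear_model import LogisticRegression

dataset = fetch_20newsgroups_vectorized(subset='train')
# A deliberately truncated run, for speed; expect a convergence warning
lr = LogisticRegression(solver='saga', penalty='l1',
                        multi_class='multinomial',
                        max_iter=5, random_state=42)
lr.fit(dataset.data, dataset.target)

for class_idx, class_name in enumerate(dataset.target_names):
    weights = lr.coef_[class_idx]
    # Columns whose weights the l1 penalty did not shrink to zero
    kept = np.flatnonzero(weights)
    # The ten strongest survivors, ranked by absolute weight
    top = kept[np.argsort(-np.abs(weights[kept]))][:10]
    print('%s: feature columns %s' % (class_name, top))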
A more traditional (and possibly better) way to predict on a sparse subset of input features would be to use univariate feature selection followed by a standard (l2-penalised) logistic regression model.
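For concreteness, here is a minimal sketch of that alternative (again our illustration, not part of the example): univariate selection with SelectKBest and the chi2 criterion, a reasonable score for the non-negative tf-idf features, feeding an l2-penalised model. The cutoff k=10000 is an arbitrary choice for the sketch.

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

dataset = fetch_20newsgroups_vectorized(subset='train')
# Keep the 10000 features most associated with the labels, then fit a
# standard (l2-penalised) multinomial model on that reduced subset
clf = make_pipeline(SelectKBest(chi2, k=10000),
                    LogisticRegression(solver='saga', penalty='l2',
                                       multi_class='multinomial',
                                       random_state=42))
clf.fit(dataset.data, dataset.target)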
import time

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

print(__doc__)
# Author: Arthur Mensch
t0 = time.time()

# We use the SAGA solver
solver = 'saga'

# Turn down for faster run time
n_samples = 10000

# The vectorized data is cached on disk, so later runs are faster
dataset = fetch_20newsgroups_vectorized(subset='all')
X = dataset.data
y = dataset.target
X = X[:n_samples]
y = y[:n_samples]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42,
                                                    stratify=y,
                                                    test_size=0.1)
train_samples, n_features = X_train.shape
n_classes = np.unique(y).shape[0]
print('Dataset 20newsgroups, train_samples=%i, n_features=%i, n_classes=%i'
      % (train_samples, n_features, n_classes))
models = {'ovr': {'name': 'One versus Rest', 'iters': [1, 3]},
          'multinomial': {'name': 'Multinomial', 'iters': [1, 3, 7]}}
for model in models:
    # Add initial chance-level values for plotting purposes
    accuracies = [1 / n_classes]
    times = [0]
    densities = [1]

    model_params = models[model]

    # Small number of epochs for fast runtime
    for this_max_iter in model_params['iters']:
        print('[model=%s, solver=%s] Number of epochs: %s' %
              (model_params['name'], solver, this_max_iter))
        lr = LogisticRegression(solver=solver,
                                multi_class=model,
                                C=1,
                                penalty='l1',
                                fit_intercept=True,
                                max_iter=this_max_iter,
                                random_state=42,
                                )
        t1 = time.time()
        lr.fit(X_train, y_train)
        train_time = time.time() - t1

        y_pred = lr.predict(X_test)
        accuracy = np.sum(y_pred == y_test) / y_test.shape[0]
        # Percentage of non-zero coefficients per class: a measure of how
        # aggressively the l1 penalty has pruned the vocabulary
        density = np.mean(lr.coef_ != 0, axis=1) * 100
        accuracies.append(accuracy)
        densities.append(density)
        times.append(train_time)
    models[model]['times'] = times
    models[model]['densities'] = densities
    models[model]['accuracies'] = accuracies
    print('Test accuracy for model %s: %.4f' % (model, accuracies[-1]))
    print('%% non-zero coefficients for model %s, '
          'per class:\n %s' % (model, densities[-1]))
    print('Run time (%i epochs) for model %s: '
          '%.2f s' % (model_params['iters'][-1], model, times[-1]))
fig = plt.figure()
ax = fig.add_subplot(111)

for model in models:
    name = models[model]['name']
    times = models[model]['times']
    accuracies = models[model]['accuracies']
    ax.plot(times, accuracies, marker='o',
            label='Model: %s' % name)
ax.set_xlabel('Train time (s)')
ax.set_ylabel('Test accuracy')
ax.legend()
fig.suptitle('Multinomial vs One-vs-Rest Logistic L1\n'
             'Dataset %s' % '20newsgroups')
fig.tight_layout()
fig.subplots_adjust(top=0.85)

run_time = time.time() - t0
print('Example run in %.3f s' % run_time)
plt.show()