# Import pandas, NumPy, and Matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the comma-separated data file from the working directory into a
# DataFrame, then preview its first five rows.
transfusion = pd.read_csv(filepath_or_buffer="transfusion.data")
transfusion.head(n=5)
Recency (months) | Frequency (times) | Monetary (c.c. blood) | Time (months) | whether he/she donated blood in March 2007 | |
---|---|---|---|---|---|
0 | 2 | 50 | 12500 | 98 | 1 |
1 | 0 | 13 | 3250 | 28 | 1 |
2 | 1 | 16 | 4000 | 35 | 1 |
3 | 2 | 20 | 5000 | 45 | 1 |
4 | 1 | 24 | 6000 | 77 | 0 |
# Summary statistics (count, mean, std, min, quartiles, max) for every column
transfusion.describe()
Recency (months) | Frequency (times) | Monetary (c.c. blood) | Time (months) | whether he/she donated blood in March 2007 | |
---|---|---|---|---|---|
count | 748.000000 | 748.000000 | 748.000000 | 748.000000 | 748.000000 |
mean | 9.506684 | 5.514706 | 1378.676471 | 34.282086 | 0.237968 |
std | 8.095396 | 5.839307 | 1459.826781 | 24.376714 | 0.426124 |
min | 0.000000 | 1.000000 | 250.000000 | 2.000000 | 0.000000 |
25% | 2.750000 | 2.000000 | 500.000000 | 16.000000 | 0.000000 |
50% | 7.000000 | 4.000000 | 1000.000000 | 28.000000 | 0.000000 |
75% | 14.000000 | 7.000000 | 1750.000000 | 50.000000 | 0.000000 |
max | 74.000000 | 50.000000 | 12500.000000 | 98.000000 | 1.000000 |
# Print each column's dtype and non-null count, plus the frame's memory usage
transfusion.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 748 entries, 0 to 747 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Recency (months) 748 non-null int64 1 Frequency (times) 748 non-null int64 2 Monetary (c.c. blood) 748 non-null int64 3 Time (months) 748 non-null int64 4 whether he/she donated blood in March 2007 748 non-null int64 dtypes: int64(5) memory usage: 29.3 KB
### Specifying target var
# Give the long label column a short name. The frame is modified in place,
# so every later cell sees the column as 'target'.
column_aliases = {'whether he/she donated blood in March 2007': 'target'}
transfusion.rename(columns=column_aliases, inplace=True)

# Preview the frame to confirm the new column name
transfusion.head()
Recency (months) | Frequency (times) | Monetary (c.c. blood) | Time (months) | target | |
---|---|---|---|---|---|
0 | 2 | 50 | 12500 | 98 | 1 |
1 | 0 | 13 | 3250 | 28 | 1 |
2 | 1 | 16 | 4000 | 35 | 1 |
3 | 2 | 20 | 5000 | 45 | 1 |
4 | 1 | 24 | 6000 | 77 | 0 |
# Summary statistics again, rounded to two decimals for readability
transfusion.describe().round(2)
Recency (months) | Frequency (times) | Monetary (c.c. blood) | Time (months) | target | |
---|---|---|---|---|---|
count | 748.00 | 748.00 | 748.00 | 748.00 | 748.00 |
mean | 9.51 | 5.51 | 1378.68 | 34.28 | 0.24 |
std | 8.10 | 5.84 | 1459.83 | 24.38 | 0.43 |
min | 0.00 | 1.00 | 250.00 | 2.00 | 0.00 |
25% | 2.75 | 2.00 | 500.00 | 16.00 | 0.00 |
50% | 7.00 | 4.00 | 1000.00 | 28.00 | 0.00 |
75% | 14.00 | 7.00 | 1750.00 | 50.00 | 0.00 |
max | 74.00 | 50.00 | 12500.00 | 98.00 | 1.00 |
# Relative frequency of each label value (fractions rather than raw counts)
targetProportions = transfusion['target'].value_counts(normalize=True)
targetProportions.round(3)
0 0.762 1 0.238 Name: target, dtype: float64
### Train-Test Split
from sklearn.model_selection import train_test_split
# Separate the predictors from the label, then hold out 25% of the rows for
# testing. The split is stratified on the label and seeded so it is repeatable.
features = transfusion.drop(columns='target')
labels = transfusion.target
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
    stratify=labels,
)

# Preview the first two training rows
X_train.head(2)
Recency (months) | Frequency (times) | Monetary (c.c. blood) | Time (months) | |
---|---|---|---|---|
334 | 16 | 2 | 500 | 16 |
99 | 5 | 7 | 1750 | 26 |
### Checking variances for normalization
# Per-feature variance of the training set, rounded to three decimals
X_train.var().round(3)
Recency (months) 66.929 Frequency (times) 33.830 Monetary (c.c. blood) 2114363.700 Time (months) 611.147 dtype: float64
# Work on copies so the raw train/test frames stay untouched
X_train_normed = X_train.copy()
X_test_normed = X_test.copy()

# Column to be replaced by its natural logarithm
normCol = 'Monetary (c.c. blood)'

for frame in (X_train_normed, X_test_normed):
    # pop() removes the raw column in place and returns it; its log is
    # appended as the last column under the name 'monetary_log'.
    frame['monetary_log'] = np.log(frame.pop(normCol))

# Variances after the transformation
X_train_normed.var()
Recency (months) 66.929017 Frequency (times) 33.829819 Time (months) 611.146588 monetary_log 0.837458 dtype: float64
conda install numpy scipy scikit-learn pandas joblib pytorch
Collecting package metadata (current_repodata.json): done Solving environment: failed with initial frozen solve. Retrying with flexible solve. Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source. Collecting package metadata (repodata.json): done Solving environment: failed with initial frozen solve. Retrying with flexible solve. Solving environment: | Found conflicts! Looking for incompatible packages. This can take several minutes. Press CTRL-C to abort. Examining conflict for pytables seaborn backports.functools_lru_cache pyqt dask| ^C failed CondaError: KeyboardInterrupt Note: you may need to restart the kernel to use updated packages.
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score
# Search settings for TPOT: 5 generations with a population of 20, candidates
# scored by ROC AUC, restricted to the 'TPOT light' configuration.
tpot_settings = dict(
    generations=5,
    population_size=20,
    verbosity=2,
    scoring='roc_auc',
    random_state=42,
    disable_update_check=True,
    config_dict='TPOT light',
)
tpot = TPOTClassifier(**tpot_settings)
tpot.fit(X_train, y_train)

# Evaluate on the held-out split using the second predict_proba column
tpot_test_proba = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, tpot_test_proba)
print(f'\nAUC score: {tpot_auc_score:.4f}')

# List the steps of the fitted best pipeline, numbered from 1
print('\nBest pipeline steps:')
for idx, (_step_name, transform) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    print(f'{idx}. {transform}')
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) <ipython-input-21-d19a2acc4cbe> in <module> ----> 1 from tpot import TPOTClassifier 2 from sklearn.metrics import roc_auc_score 3 4 # Instantiate TPOTClassifier 5 tpot = TPOTClassifier( ModuleNotFoundError: No module named 'tpot'
from sklearn import linear_model
# Imported here as well so this cell does not depend on the TPOT cell, which
# only imports roc_auc_score after `from tpot import TPOTClassifier` succeeds.
from sklearn.metrics import roc_auc_score

# Instantiate LogisticRegression
logreg = linear_model.LogisticRegression(
    solver='liblinear',
    random_state=42
)

# Train the model on the log-transformed training features
logreg.fit(X_train_normed, y_train)

# AUC score for the logistic regression model, computed from the second
# predict_proba column on the log-transformed test features
logreg_auc_score = roc_auc_score(y_test, logreg.predict_proba(X_test_normed)[:, 1])
print(f'\nAUC score: {logreg_auc_score:.4f}')
AUC score: 0.7891
# Show the estimator's hyperparameters. get_params() only reports the
# constructor settings, so there is no need to refit the model first.
logreg.get_params()
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
# Coefficients of the already-fitted model, one per column of X_train_normed
# (refitting on the same data here would only repeat the earlier training).
logreg.coef_[0]
array([-0.09094097, 0.09625316, -0.02637475, 0.28757045])
# Feature names in the order of the coefficients above: logreg was fitted on
# X_train_normed, whose columns differ from those of the raw X_train.
X_train_normed.columns
Index(['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)', 'Time (months)'], dtype='object')
# Accuracy must be measured on the log-transformed test features: logreg was
# fitted on X_train_normed, whose column layout differs from the raw X_test.
print("Model accuracy using test split:", round(logreg.score(X_test_normed, y_test), 2))
Model accuracy using test split: 0.76
# Show the row at position 2 of the transformed training features
X_train_normed.iloc[2]
Recency (months) 2.000000 Frequency (times) 7.000000 Time (months) 46.000000 monetary_log 7.467371 Name: 116, dtype: float64
# Predicting new entries
# Label the values with the training column names so each number is matched
# to the feature the model was fitted on. The last column is the log of the
# monetary value, not the raw c.c. amount.
new_entries = pd.DataFrame(
    [[2, 7, 45, 7], [5, 28, 60, 7]],
    columns=X_train_normed.columns,
)
logreg.predict(new_entries)
array([0, 1])