# Import pandas, NumPy and matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw blood-transfusion records into a DataFrame.
transfusion = pd.read_csv(filepath_or_buffer="transfusion.data")
# Preview the first rows (5 is the pandas default for head()).
transfusion.head(n=5)


# Summary statistics for every numeric column.
transfusion.describe()


# Column dtypes, non-null counts and memory usage.
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


### Specifying target var
# Give the long outcome column a short name that later cells refer to.
transfusion = transfusion.rename(
    columns={'whether he/she donated blood in March 2007': 'target'}
)
transfusion.head()


# Same summary statistics as before, rounded to two decimals for readability.
transfusion.describe().round(2)


# Relative frequency of each class in the target (fractions, not counts).
targetProportions = transfusion['target'].value_counts(normalize=True)
targetProportions.round(3)

0    0.762
1    0.238
Name: target, dtype: float64


### Train-Test Split

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in the train and test parts; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    transfusion.drop('target', axis=1),
    transfusion['target'],
    stratify=transfusion['target'],
    test_size=0.25,
    random_state=42,
)

X_train.head(n=2)


### Checking variances for normalization
# A feature whose variance dwarfs the others is a candidate for rescaling.
X_train.var().round(3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64


normCol = 'Monetary (c.c. blood)'

# Build transformed copies of both splits: append the natural log of the
# high-variance column as `monetary_log`, then drop the raw column. The
# original X_train / X_test frames are left unmodified.
X_train_normed, X_test_normed = (
    split.assign(monetary_log=np.log(split[normCol])).drop(columns=normCol)
    for split in (X_train, X_test)
)

# Re-check the variances after the transform.
X_train_normed.var()

Recency (months)      66.929017
Frequency (times)     33.829819
Time (months)        611.146588
monetary_log           0.837458
dtype: float64


conda install numpy scipy scikit-learn pandas joblib pytorch

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: | 
Found conflicts! Looking for incompatible packages.
This can take several minutes.  Press CTRL-C to abort.
Examining conflict for pytables seaborn backports.functools_lru_cache pyqt dask| ^C
                                                                               failed

CondaError: KeyboardInterrupt


Note: you may need to restart the kernel to use updated packages.


from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score

# Configure the TPOT pipeline search: 5 generations of 20 candidate pipelines,
# ranked by ROC AUC, restricted to the 'TPOT light' configuration.
tpot = TPOTClassifier(
    config_dict='TPOT light',
    scoring='roc_auc',
    generations=5,
    population_size=20,
    random_state=42,
    verbosity=2,
    disable_update_check=True,
)
tpot.fit(X_train, y_train)

# Score the held-out split using the predicted probability of class 1.
tpot_test_proba = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, tpot_test_proba)
print(f'\nAUC score: {tpot_auc_score:.4f}')

# List the steps of the best pipeline found, numbered from 1.
print('\nBest pipeline steps:', end='\n')
for idx, (_, transform) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    print(f'{idx}. {transform}')

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-21-d19a2acc4cbe> in <module>
----> 1 from tpot import TPOTClassifier
      2 from sklearn.metrics import roc_auc_score
      3 
      4 # Instantiate TPOTClassifier
      5 tpot = TPOTClassifier(

ModuleNotFoundError: No module named 'tpot'


from sklearn import linear_model
# Imported here as well so this cell does not depend on the TPOT cell, whose
# first import fails when tpot is not installed.
from sklearn.metrics import roc_auc_score

# Instantiate LogisticRegression
logreg = linear_model.LogisticRegression(
    solver='liblinear',
    random_state=42
)

# Train the model on the log-transformed training features
logreg.fit(X_train_normed, y_train)

# AUC score for the logistic regression model, computed on the held-out split
# from the predicted probability of class 1
logreg_auc_score = roc_auc_score(y_test, logreg.predict_proba(X_test_normed)[:, 1])
print(f'\nAUC score: {logreg_auc_score:.4f}')

AUC score: 0.7891


# Hyper-parameters of the estimator. get_params() reads the constructor
# settings, so the already-fitted model does not need to be refitted here.
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


# Learned coefficients, one per column of X_train_normed in that column order.
# The model is already fitted, so it is read directly instead of refitting.
logreg.coef_[0]

array([-0.09094097,  0.09625316, -0.02637475,  0.28757045])


# Column names matching the coefficients above. The model was fitted on
# X_train_normed, whose columns and order differ from the raw X_train
# (the Monetary column was replaced by a trailing monetary_log).
X_train_normed.columns

Index(['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
       'Time (months)'],
      dtype='object')


# Accuracy must be measured on the same log-transformed features the model was
# trained on; the raw X_test has a different column set and order.
print("Model accuracy using test split:",  round(logreg.score(X_test_normed, y_test), 2))

Model accuracy using test split: 0.76


# Show the third row (position 2) of the transformed training features.
# NOTE(review): presumably used as a template for the hand-written rows
# passed to predict() below — confirm.
X_train_normed.iloc[2]

Recency (months)      2.000000
Frequency (times)     7.000000
Time (months)        46.000000
monetary_log          7.467371
Name: 116, dtype: float64


# Predicting new entries
# Wrap the rows in a DataFrame that reuses the training column names, so the
# expected feature order is explicit and scikit-learn does not warn about
# missing feature names.
# NOTE(review): the last value of each row is monetary_log; 7 for the
# 28-donation row looks inconsistent with the training data — confirm.
new_entries = pd.DataFrame(
    [[2, 7, 45, 7], [5, 28, 60, 7]],
    columns=X_train_normed.columns,
)
logreg.predict(new_entries)

array([0, 1])

	Recency (months)	Frequency (times)	Monetary (c.c. blood)	Time (months)	whether he/she donated blood in March 2007
count	748.000000	748.000000	748.000000	748.000000	748.000000
mean	9.506684	5.514706	1378.676471	34.282086	0.237968
std	8.095396	5.839307	1459.826781	24.376714	0.426124
min	0.000000	1.000000	250.000000	2.000000	0.000000
25%	2.750000	2.000000	500.000000	16.000000	0.000000
50%	7.000000	4.000000	1000.000000	28.000000	0.000000
75%	14.000000	7.000000	1750.000000	50.000000	0.000000
max	74.000000	50.000000	12500.000000	98.000000	1.000000

	Recency (months)	Frequency (times)	Monetary (c.c. blood)	Time (months)	target
count	748.00	748.00	748.00	748.00	748.00
mean	9.51	5.51	1378.68	34.28	0.24
std	8.10	5.84	1459.83	24.38	0.43
min	0.00	1.00	250.00	2.00	0.00
25%	2.75	2.00	500.00	16.00	0.00
50%	7.00	4.00	1000.00	28.00	0.00
75%	14.00	7.00	1750.00	50.00	0.00
max	74.00	50.00	12500.00	98.00	1.00

Loading in the Data / Basic Exploration

Modelling the Data