In [1]:
# %%
import pandas as pd
data = pd.read_csv("data/countdata_large_normed.csv")
Note: however you run Python (or whichever Python IDE you use), make sure that
the working directory contains the subfolder "data/" with "countdata_large_normed.csv";
otherwise, adjust the read_csv line accordingly.
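If the file lives somewhere else, you can also point read_csv at an explicit path; a minimal sketch (the path below is only a placeholder):
# %%
# hypothetical alternative: read the file from an explicit location
from pathlib import Path
data_dir = Path("/path/to/your/data")  # placeholder, adjust to your setup
data = pd.read_csv(data_dir / "countdata_large_normed.csv")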
In [2]:
# %%
# inspect the first 5 rows of the data
data.head()
Out[2]:
num_sentences | subst | subst_prop | verb | adj | kolon | punkt | komma | spm | utrop | ... | id | section_number | author | born | category | n_authors | publication_year | sex | title_monogr | topics | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.081678 | 0.203091 | 0.094923 | 0.152318 | 0.101545 | 0.015453 | 0.059603 | 0.044150 | 0.004415 | 0.0 | ... | AV01Af930008 | 0 | [] | [] | AV | 0 | 1993.0 | [] | Aftenposten | [] |
1 | 0.081319 | 0.217582 | 0.118681 | 0.147253 | 0.092308 | 0.000000 | 0.070330 | 0.046154 | 0.000000 | 0.0 | ... | AV01Af930016 | 0 | [] | [] | AV | 0 | 1993.0 | [] | Aftenposten | [] |
2 | 0.062241 | 0.236515 | 0.070539 | 0.151452 | 0.101660 | 0.002075 | 0.056017 | 0.035270 | 0.000000 | 0.0 | ... | AV01Af930019 | 0 | [] | [] | AV | 0 | 1993.0 | [] | Aftenposten | [] |
3 | 0.072144 | 0.232465 | 0.076152 | 0.140281 | 0.102204 | 0.000000 | 0.062124 | 0.036072 | 0.002004 | 0.0 | ... | AV01Af930023 | 0 | Jonassen, Arild M. | NaN | AV | 1 | 1993.0 | M | Aftenposten | [] |
4 | 0.076419 | 0.185590 | 0.030568 | 0.205240 | 0.093886 | 0.004367 | 0.065502 | 0.039301 | 0.006550 | 0.0 | ... | AV01Af930024 | 0 | Bisseberg, Aslaug | NaN | AV | 1 | 1993.0 | M | Aftenposten | [] |
5 rows × 24 columns
In [3]:
# %%
data.rename(
columns={
"subst": "noun",
"subst_prop": "prop_noun",
"kolon": "colon",
"punkt": "period",
"komma": "comma",
"utrop": "exclamation",
},
inplace=True,
)
In [4]:
# %%
print(data.columns)
print(data["category"].value_counts())
Index(['num_sentences', 'noun', 'prop_noun', 'verb', 'adj', 'colon', 'period',
       'comma', 'spm', 'exclamation', 'overskrift', 'ukjent', 'mean_sent_length',
       'num_words', 'id', 'section_number', 'author', 'born', 'category',
       'n_authors', 'publication_year', 'sex', 'title_monogr', 'topics'],
      dtype='object')
category
AV    5205
SA    3000
SK    2999
Name: count, dtype: int64
In [5]:
# %%
feature_cols = [
"category",
"num_sentences",
"noun",
"prop_noun",
"verb",
"adj",
"colon",
"period",
"comma",
"exclamation",
]
input = data[feature_cols]
print(input.columns)
Index(['category', 'num_sentences', 'noun', 'prop_noun', 'verb', 'adj', 'colon', 'period', 'comma', 'exclamation'], dtype='object')
In [6]:
# %%
# datatypes
print(input.dtypes)
category          object
num_sentences    float64
noun             float64
prop_noun        float64
verb             float64
adj              float64
colon            float64
period           float64
comma            float64
exclamation      float64
dtype: object
In [7]:
# %%
import seaborn as sns
import matplotlib.pyplot as plt
sns.boxplot(data=input, x="category", y="prop_noun", hue="category")
Out[7]:
<Axes: xlabel='category', ylabel='prop_noun'>
In [8]:
# %%
sns.boxplot(data=input, x="category", y="prop_noun", hue="category")
plt.ylim(0, 0.3)
Out[8]:
(0.0, 0.3)
In [9]:
# %%
# plot noun ("subst") against category
sns.boxplot(data=data, x="category", y="noun", hue="category")
Out[9]:
<Axes: xlabel='category', ylabel='noun'>
In [10]:
# %%
long_data = input.melt(id_vars="category", value_name="value", var_name="variable")
print(long_data)
       category       variable     value
0            AV  num_sentences  0.081678
1            AV  num_sentences  0.081319
2            AV  num_sentences  0.062241
3            AV  num_sentences  0.072144
4            AV  num_sentences  0.076419
...         ...            ...       ...
100831       SK    exclamation  0.002371
100832       SK    exclamation  0.003124
100833       SK    exclamation  0.004098
100834       SK    exclamation  0.004045
100835       SK    exclamation  0.001170

[100836 rows x 3 columns]
In [11]:
# %%
sns.boxplot(data=long_data, x="variable", y="value", hue="category")
Out[11]:
<Axes: xlabel='variable', ylabel='value'>
In [12]:
# %%
g = sns.FacetGrid(data=long_data, col="variable", col_wrap=3)
g.map_dataframe(sns.boxplot, x="category", y="value")
g.set_titles("{col_name}")  # by default, titles are "variable = {col_name}"
g.add_legend()
Out[12]:
<seaborn.axisgrid.FacetGrid at 0x7f61e61f90d0>
In [13]:
# %%
sns.pairplot(data=input.iloc[:, 1:], markers=".", diag_kind="kde")
Out[13]:
<seaborn.axisgrid.PairGrid at 0x7f61e59670a0>
In [14]:
# %%
input_bin = input[input["category"] != "SK"]
category_counts = input_bin["category"].value_counts()
print(category_counts)
category
AV    5205
SA    3000
Name: count, dtype: int64
In [15]:
# %%
from sklearn.model_selection import train_test_split
train_set_prop = 0.8
random_seed = 561
train, test = train_test_split(
input_bin,
train_size=train_set_prop,
stratify=input_bin["category"],
random_state=random_seed,
)
In [16]:
# %%
print(train["category"].value_counts())
category
AV    4164
SA    2400
Name: count, dtype: int64
In [17]:
# %%
print(test["category"].value_counts())
category
AV    1041
SA     600
Name: count, dtype: int64
In [18]:
# %%
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
# Specify the number of trees for boosting
trees = 50
X_train = train.drop("category", axis=1)
le = LabelEncoder()
y_train = le.fit_transform(train["category"])
# Create an XGBClassifier model
# NOTE: it is best practice to specify "objective", but for some reason this
# does not work when running "jupyter" from the command line
# xgb_model = xgb.XGBClassifier(n_estimators=trees, objective='binary:logistic', random_state=169)
xgb_model = xgb.XGBClassifier(n_estimators=trees, random_state=169)
# Train the model
xgb_model.fit(X_train, y_train)
xgb_bin = xgb_model
In [19]:
# %%
X_test = test.drop("category", axis=1)
test_predictions = xgb_model.predict(X_test)
test_predictions = le.inverse_transform(test_predictions)
# see some of the predictions
print(test_predictions[:10])
# the columns correspond to order defined in
# le.classes_
print(le.classes_)
['AV' 'AV' 'AV' 'AV' 'AV' 'AV' 'AV' 'SA' 'SA' 'SA']
['AV' 'SA']
In [20]:
# %%
test_probabilities = xgb_bin.predict_proba(X_test)
# Display the predicted probabilities
print(test_probabilities[:10])
# as a reminder: columns correspond to le.classes_
[[9.9912620e-01 8.7378256e-04]
 [8.3496362e-01 1.6503637e-01]
 [9.9264377e-01 7.3562474e-03]
 [9.9238342e-01 7.6165958e-03]
 [9.7537082e-01 2.4629178e-02]
 [9.9868077e-01 1.3192138e-03]
 [9.8774821e-01 1.2251808e-02]
 [8.5055828e-04 9.9914944e-01]
 [2.0072168e-01 7.9927832e-01]
 [9.1144443e-03 9.9088556e-01]]
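As a small illustration (not in the original notebook), the default predictions can be recovered from these probabilities by picking the column with the highest value:
# %%
# sketch: argmax over the probability columns reproduces the default predictions
pred_from_proba = le.inverse_transform(test_probabilities.argmax(axis=1))
print((pred_from_proba == test_predictions).all())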
In [21]:
# %%
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
y_test = test["category"]
cm = confusion_matrix(y_test, test_predictions)
cm_display = ConfusionMatrixDisplay(cm, display_labels=le.classes_)
cm_display.plot()
Out[21]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f61d89c6130>
In [22]:
# %%
# NOTE: sklearn does not actually compute these metrics from the confusion matrix,
# but we explain them via the confusion matrix anyway
from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(y_test, test_predictions)
print(test_accuracy)
0.9518586227909811
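Although sklearn does not derive it from the confusion matrix internally, the same accuracy can be recovered from the matrix we just built; a small illustration:
# %%
# illustration: accuracy = correct predictions (the diagonal) divided by all predictions
print(cm.trace() / cm.sum())  # should match accuracy_score above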
In [23]:
# %%
from sklearn.metrics import precision_score
# Compute precision score
xgb_precision = precision_score(y_test, test_predictions, average=None)
# print(xgb_precision)
print(pd.DataFrame({"category": le.classes_, "precision": xgb_precision}))
  category  precision
0       AV   0.954631
1       SA   0.946827
In [24]:
# %%
from sklearn.metrics import recall_score
# Compute recall score
xgb_recall = recall_score(y_test, test_predictions, average=None)
# print(xgb_recall)
print(pd.DataFrame({"category": le.classes_, "recall": xgb_recall}))
  category    recall
0       AV  0.970221
1       SA  0.920000
In [25]:
# %%
# Create a DataFrame with observed and predicted values
xgb_res = pd.DataFrame({"obs": y_test, "pred_SA": test_probabilities[:, 1]})
# plt.figure(figsize=(8, 6)) # not necessary
sns.boxplot(x="obs", y="pred_SA", showfliers=False, data=xgb_res, hue="obs")
sns.stripplot(x="obs", y="pred_SA", data=xgb_res, color="black", alpha=0.3)
plt.axhline(y=0.5, color="red", linestyle="--")
Out[25]:
<matplotlib.lines.Line2D at 0x7f62029b73d0>
In [26]:
# %%
from IPython.display import SVG, display
display(SVG(filename="images/overfitting_runs.svg"))
In [27]:
# %%
threshold_new = 0.8
xgb_pred_new = (test_probabilities[:, 1] >= threshold_new).astype(int)
xgb_pred_new = le.inverse_transform(xgb_pred_new)
In [28]:
# %%
xgb_conf_new = confusion_matrix(y_test, xgb_pred_new)
xgb_cm_display = ConfusionMatrixDisplay(xgb_conf_new, display_labels=le.classes_)
xgb_cm_display.plot()
Out[28]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f6202a34340>
In [29]:
# %%
print(accuracy_score(y_test, xgb_pred_new))
0.947592931139549
In [30]:
# %%
print(precision_score(y_test, xgb_pred_new, average=None))
[0.93134598 0.98127341]
In [31]:
# %%
print(recall_score(test["category"], xgb_pred_new, average=None))
[0.99039385 0.87333333]
In [32]:
# %%
from sklearn.metrics import classification_report
print(classification_report(y_test, xgb_pred_new))
              precision    recall  f1-score   support

          AV       0.93      0.99      0.96      1041
          SA       0.98      0.87      0.92       600

    accuracy                           0.95      1641
   macro avg       0.96      0.93      0.94      1641
weighted avg       0.95      0.95      0.95      1641
In [33]:
# %%
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, xgb_pred_new)
Out[33]:
(array([0.93134598, 0.98127341]), array([0.99039385, 0.87333333]), array([0.95996276, 0.92416226]), array([1041, 600]))
In [34]:
# %%
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.RocCurveDisplay.html
# (there is also alternative code provided)
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(
y_test, test_probabilities[:, 1], pos_label="SA"
)
roc_auc = metrics.auc(fpr, tpr)
roc_display = metrics.RocCurveDisplay(
fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name="example estimator"
)
roc_display.plot()
Out[34]:
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x7f6202af76d0>
In [35]:
# %%
print(roc_auc)
0.9870845341018252
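As an aside, newer scikit-learn versions (1.0+) offer a one-call alternative; a sketch, assuming such a version is available:
# %%
# sketch of the shorter route: build the ROC curve directly from labels and probabilities
metrics.RocCurveDisplay.from_predictions(
    y_test, test_probabilities[:, 1], pos_label="SA", name="example estimator"
)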
In [36]:
# %%
feature_importances = pd.DataFrame(
{
"feature": xgb_model.feature_names_in_,
"importance": xgb_model.feature_importances_,
}
)
print(feature_importances)
         feature  importance
0  num_sentences    0.058148
1           noun    0.043832
2      prop_noun    0.248151
3           verb    0.033303
4            adj    0.039990
5          colon    0.234883
6         period    0.036060
7          comma    0.026662
8    exclamation    0.278972
In [37]:
# %%
feature_importances = feature_importances.sort_values(by="importance", ascending=False)
# the matplotlib alternatives below also work, but their labels would need adjusting
# plt.bar(feature_importances["feature"], feature_importances["importance"])
# plt.barh(feature_importances["feature"], feature_importances["importance"])
sns.barplot(data=feature_importances, x="importance", y="feature", orient="h")
Out[37]:
<Axes: xlabel='importance', ylabel='feature'>
In [38]:
# %%
train_set_prop = 0.8
random_seed = 561
train, test = train_test_split(
input,
train_size=train_set_prop,
stratify=input["category"],
random_state=random_seed,
)
In [39]:
# %%
# Let's quickly examine the size of our training set (now with three categories)
train["category"].value_counts()
Out[39]:
category
AV    4164
SA    2400
SK    2399
Name: count, dtype: int64
In [40]:
# %%
# and test set
test["category"].value_counts()
Out[40]:
category
AV    1041
SK     600
SA     600
Name: count, dtype: int64
In [41]:
# %%
# and finally, build our model and predict the categories
# NOTE: it is best practice to specify "objective", but for some reason this
# does not work when running "jupyter" from the command line
# xgb_model = xgb.XGBClassifier(n_estimators=trees, objective='multi:softmax', random_state=169)
xgb_model = xgb.XGBClassifier(n_estimators=trees, random_state=169)
X_train = train.drop("category", axis=1)
y_train = train["category"]
y_train = le.fit_transform(y_train)
xgb_model.fit(X_train, y_train)
X_test = test.drop("category", axis=1)
y_test = test["category"]
test_predictions = xgb_model.predict(X_test)
test_predictions = le.inverse_transform(test_predictions)
test_probabilities = xgb_model.predict_proba(X_test)
In [42]:
# %%
# Make a confusion matrix
xgb_conf = confusion_matrix(y_test, test_predictions)
cm_multi = ConfusionMatrixDisplay(xgb_conf, display_labels=le.classes_)
cm_multi.plot()
Out[42]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f61d89c6520>
In [43]:
# %%
accuracy_multi = accuracy_score(y_test, test_predictions)
print(accuracy_multi)
0.9219098616688978
In [44]:
# %%
multi_prec = precision_score(y_test, test_predictions, average=None)
# print(multi_prec)
print(pd.DataFrame({"category": le.classes_, "precision": multi_prec}))
  category  precision
0       AV   0.949153
1       SA   0.890071
2       SK   0.904065
In [45]:
# %%
multi_recall = recall_score(y_test, test_predictions, average=None)
# print(multi_recall)
print(pd.DataFrame({"category": le.classes_, "recall": multi_recall}))
  category    recall
0       AV  0.968300
1       SA  0.836667
2       SK  0.926667
In [46]:
# %%
print(classification_report(y_test, test_predictions))
precision_recall_fscore_support(y_test, test_predictions)
              precision    recall  f1-score   support

          AV       0.95      0.97      0.96      1041
          SA       0.89      0.84      0.86       600
          SK       0.90      0.93      0.92       600

    accuracy                           0.92      2241
   macro avg       0.91      0.91      0.91      2241
weighted avg       0.92      0.92      0.92      2241
Out[46]:
(array([0.94915254, 0.89007092, 0.90406504]), array([0.96829971, 0.83666667, 0.92666667]), array([0.95863053, 0.86254296, 0.91522634]), array([1041, 600, 600]))
In [47]:
# %%
print(precision_score(y_test, test_predictions, average="weighted"))
0.9212625498402064
In [48]:
# %%
print(recall_score(y_test, test_predictions, average="weighted"))
0.9219098616688978
In [49]:
# %%
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, test_probabilities, multi_class="ovr")
Out[49]:
0.9862762605605674
In [50]:
# %%
# Create a DataFrame with predicted probabilities and real observations
test_plot = pd.DataFrame(
{
"obs": y_test,
"pred_" + le.classes_[0]: test_probabilities[:, 0],
"pred_" + le.classes_[1]: test_probabilities[:, 1],
"pred_" + le.classes_[2]: test_probabilities[:, 2],
}
)
# plot results
# plt.figure(figsize=(8, 6)) # not necessary
sns.boxplot(x="obs", y="pred_SA", showfliers=False, data=test_plot, hue="obs")
sns.stripplot(x="obs", y="pred_SA", data=test_plot, color="black", alpha=0.3)
plt.axhline(y=0.3, color="red", linestyle="--")
Out[50]:
<matplotlib.lines.Line2D at 0x7f61bc3245e0>
In [51]:
# %%
# Reshape the DataFrame to long format
test_plot_long = test_plot.melt(id_vars="obs", value_name="value", var_name="variable")
# split into subplots
g = sns.FacetGrid(data=test_plot_long, col="variable")
g.map_dataframe(sns.boxplot, x="obs", y="value")
g.set_titles("{col_name}")  # by default, titles are "variable = {col_name}"
g.add_legend()
Out[51]:
<seaborn.axisgrid.FacetGrid at 0x7f61bc3511f0>
In [52]:
# %%
# Get feature importances
feature_importances = pd.DataFrame(
{
"feature": xgb_model.feature_names_in_,
"importance": xgb_model.feature_importances_,
}
)
print(feature_importances)
         feature  importance
0  num_sentences    0.054477
1           noun    0.214677
2      prop_noun    0.200245
3           verb    0.048773
4            adj    0.063865
5          colon    0.138989
6         period    0.035242
7          comma    0.059009
8    exclamation    0.184724
In [53]:
# %%
feature_importances = feature_importances.sort_values(by="importance", ascending=False)
# the matplotlib alternatives below also work, but their labels would need adjusting
# plt.bar(feature_importances["feature"], feature_importances["importance"])
# plt.barh(feature_importances["feature"], feature_importances["importance"])
sns.barplot(data=feature_importances, x="importance", y="feature", orient="h")
Out[53]:
<Axes: xlabel='importance', ylabel='feature'>
Exercise (optional)
- Fit a multiclass classification model, where you pick hyperparameter values
from a certain range.
DON'T SHOW THE CODE TO THE PARTICIPANTS (give them 5-10 minutes)!!!!
(A possible solution sketch follows the placeholder cells below.)
We will talk later about what these hyperparameters represent, during hyperparameter tuning
(if we get that far)
Default values:
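One (hypothetical) way to inspect what an untuned model would use is to look at its parameters; note that xgboost reports None for values it leaves at its internal defaults:
# %%
# sketch: sklearn-level parameters of an untuned XGBClassifier;
# values shown as None fall back to xgboost's internal defaults
default_params = xgb.XGBClassifier().get_params()
print({key: default_params[key] for key in ["n_estimators", "max_depth", "learning_rate", "subsample"]})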
In [54]:
# %%
# 1) set up your xgb model with specified values for hyperparameters
In [55]:
# %%
# 2) Calculate model evaluation metrics (accuracy, macro-precision, macro-recall, auc)
# (use X_train, y_train, X_test, y_test that we have made before)
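A possible solution sketch (for the instructor only; the hyperparameter values below are arbitrary choices, not recommendations):
# %%
# 1) model with hand-picked hyperparameter values (arbitrary, for illustration)
xgb_exercise = xgb.XGBClassifier(
    n_estimators=50, max_depth=4, learning_rate=0.2, subsample=0.8, random_state=169
)
xgb_exercise.fit(X_train, y_train)
# 2) evaluation metrics on the existing train/test split
exercise_pred = le.inverse_transform(xgb_exercise.predict(X_test))
exercise_proba = xgb_exercise.predict_proba(X_test)
print("Accuracy:       ", accuracy_score(y_test, exercise_pred))
print("Macro precision:", precision_score(y_test, exercise_pred, average="macro"))
print("Macro recall:   ", recall_score(y_test, exercise_pred, average="macro"))
print("AUC (ovr):      ", roc_auc_score(y_test, exercise_proba, multi_class="ovr"))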
In [56]:
# %%
display(SVG(filename="images/CV_image.svg"))
In [57]:
# %%
from sklearn.model_selection import cross_val_score
# Define your XGBoost model
xgb_mod = xgb.XGBClassifier(n_estimators=50)
# Prepare the data
X = input_bin.drop("category", axis=1) # Features
y = input_bin["category"] # Target variable
y = le.fit_transform(y)
# Perform cross-validation
# NOTE: this is just one way of doing it; the next chunk shows another way
scores = cross_val_score(xgb_mod, X, y, cv=10, scoring="accuracy")
# Print the cross-validation scores
print("Average Accuracy:", scores.mean())
Average Accuracy: 0.9471046611805948
In [58]:
# %%
print("Cross-Validation scores:", scores)
Cross-Validation scores: [0.95980512 0.93422655 0.92813642 0.95371498 0.96589525 0.93414634 0.94512195 0.93170732 0.96707317 0.95121951]
In [59]:
# %%
from sklearn.model_selection import cross_validate
# Example with single metric
# cv_results = cross_validate(xgb_mod, X, y, cv=10, scoring="accuracy")
# print(cv_results["test_score"])
scoring = {"acc": "accuracy", "prec": "precision", "rec": "recall", "auc": "roc_auc"}
cv_results = cross_validate(xgb_mod, X, y, cv=10, scoring=scoring)
print(cv_results["test_acc"])
print(cv_results["test_prec"])
print(cv_results["test_rec"])
print(cv_results["test_auc"])
[0.95980512 0.93422655 0.92813642 0.95371498 0.96589525 0.93414634 0.94512195 0.93170732 0.96707317 0.95121951]
[0.94352159 0.96590909 0.92882562 0.96126761 0.97887324 0.95895522 0.96363636 0.98412698 0.98233216 0.95138889]
[0.94666667 0.85 0.87 0.91 0.92666667 0.85666667 0.88333333 0.82666667 0.92666667 0.91333333]
[0.9887396 0.98150352 0.98099808 0.98577735 0.98774152 0.9763141 0.983 0.98339103 0.99599359 0.98277564]
In [60]:
# %%
import numpy as np
accuracy_scores = cv_results["test_acc"]
accuracy_mean = accuracy_scores.mean()
accuracy_std = np.std(accuracy_scores)
print(accuracy_std)
0.013814212785095181
In [61]:
# %%
accuracy_se = accuracy_std / np.sqrt(len(accuracy_scores))
print(accuracy_se)
0.00436843764831189
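For instance, the standard error can be turned into a rough 95% confidence interval for the mean accuracy (a sketch, assuming the fold estimates are approximately normal):
# %%
# rough 95% confidence interval for the mean accuracy
print(accuracy_mean - 1.96 * accuracy_se, accuracy_mean + 1.96 * accuracy_se)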
Optional: run with parallel processing
If you want to, manually set the variable run_parallel to True
(by default it is set to False)
In [62]:
# %%
num_cores = 1 # if no parallel processing
# run_parallel = False
run_parallel = True
if run_parallel:
    import multiprocessing
    import math

    total_num_cores = multiprocessing.cpu_count()
    num_cores = math.ceil(total_num_cores / 2)
print(f"You will be using {num_cores=}")
You will be using num_cores=4
In [63]:
# %%
from sklearn.model_selection import RepeatedStratifiedKFold
n_folds = 10
n_repeats = 50
rskf = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=42)
# NOTE: if num_cores is 1 (or less than 3), consider using a
# lower n_estimators (e.g. 5)
xgb_mod = xgb.XGBClassifier(n_estimators=25)
cv_rep = cross_validate(xgb_mod, X, y, cv=rskf, scoring=scoring, n_jobs=num_cores)
In [64]:
# %%
print(cv_rep["test_acc"].mean())
0.9537652089955736
In [65]:
# %%
if n_folds * n_repeats == len(cv_rep["test_acc"]):
    print(f"The total number of estimates is {n_folds * n_repeats}")
The total number of estimates is 500
In [66]:
# %%
# Create a histogram of the accuracy scores
sns.histplot(data=cv_rep, x="test_acc")
Out[66]:
<Axes: xlabel='test_acc', ylabel='Count'>
In [67]:
# %%
# Calculate the mean accuracy for each repetition
rep_acc = np.mean(cv_rep["test_acc"].reshape((n_repeats, n_folds)), axis=1)
# Create a histogram of the mean accuracies
sns.histplot(data=rep_acc)
Out[67]:
<Axes: ylabel='Count'>
In [68]:
# %%
print(np.std(rep_acc) / np.sqrt(len(rep_acc)))
0.00015532439257155497
In [69]:
# %%
X = input.drop("category", axis=1)
y = input["category"]
y = le.fit_transform(y)
# test_predictions = le.inverse_transform(test_predictions)
n_folds = 10
n_repeats = 25
rskf = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=42)
from sklearn.metrics import make_scorer
scoring = {
"acc": "accuracy",
"prec": make_scorer(precision_score, average="weighted"),
"rec": make_scorer(recall_score, average="weighted"),
"auc": "roc_auc_ovr",
}
Exercise:
1) Get the mean accuracy, auc, precision and recall
2) Plot a histogram of all the accuracy estimates
(A possible solution sketch follows.)
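A possible solution sketch, reusing the X, y, rskf, xgb_mod and scoring objects defined above:
# %%
# possible solution sketch for the exercise above
cv_multi = cross_validate(xgb_mod, X, y, cv=rskf, scoring=scoring, n_jobs=num_cores)
print("Mean accuracy: ", cv_multi["test_acc"].mean())
print("Mean AUC:      ", cv_multi["test_auc"].mean())
print("Mean precision:", cv_multi["test_prec"].mean())
print("Mean recall:   ", cv_multi["test_rec"].mean())
# histogram of all the accuracy estimates
sns.histplot(data=cv_multi, x="test_acc")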
In [70]:
# %%
# Calculate the number of rows to select
num_rows_to_select = int(len(input) * 0.05)
# Randomly select the specified number of rows
small = input.sample(n=num_rows_to_select)
print(f"The number of rows is {len(small)}")
X = small.drop("category", axis=1)
y = small["category"]
y = le.fit_transform(y)
n_folds = 10
n_repeats = 50
rskf = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=42)
The number of rows is 560
In [71]:
# %%
cv_small = cross_validate(xgb_mod, X, y, cv=rskf, scoring=scoring, n_jobs=num_cores)
print(cv_small["test_acc"].mean())
0.8383214285714286
In [72]:
# %%
sns.histplot(data=cv_small, x="test_acc")
Out[72]:
<Axes: xlabel='test_acc', ylabel='Count'>
In [73]:
# %%
# Calculate the mean accuracy for each repetition
mean_accuracies = np.mean(cv_small["test_acc"].reshape((n_repeats, n_folds)), axis=1)
# Create a histogram of the mean accuracies
sns.histplot(data=mean_accuracies)
Out[73]:
<Axes: ylabel='Count'>
In [74]:
# %%
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
train, test = train_test_split(
input_bin,
train_size=train_set_prop,
stratify=input_bin["category"],
random_state=random_seed,
)
X_train = train.drop("category", axis=1)
y_train = le.fit_transform(train["category"])
X_test = test.drop("category", axis=1)
y_test = le.transform(test["category"])  # le is already fitted on the same two classes above
pipe = make_pipeline(StandardScaler(), LogisticRegression())
# make_pipeline names each "step" automatically; in this case, it is equivalent to:
# from sklearn.pipeline import Pipeline
# pipe = Pipeline([
# ('standardscaler', StandardScaler()),
# ('logisticregression', LogisticRegression())
# ])
# apply scaling on training data
pipe.fit(X_train, y_train)
Out[74]:
Pipeline(steps=[('standardscaler', StandardScaler()), ('logisticregression', LogisticRegression())])
In [75]:
# %%
# https://scikit-learn.org/stable/modules/preprocessing.html
# Use the pipeline to predict the categories of the test set
pipe.predict(X_test)
# shortcut to get the accuracy for the train/test split
pipe.score(X_test, y_test)
Out[75]:
0.7568555758683729
In [76]:
# %%
# Use the (automatically) named step to transform X_train
X_train_transformed = pipe.named_steps["standardscaler"].transform(X_train)
print(X_train_transformed[5, :])  # row 5, all nine feature columns
# Use the (automatically) named step to transform X_test
X_test_transformed = pipe.named_steps["standardscaler"].transform(X_test)
print(X_test_transformed[5, :])
[-0.16222738 -0.27827977 0.86882101 -1.63465125 0.32094477 -0.74909501 -0.3205673 0.0520682 -0.34894426]
[ 1.12439585 -1.09384549 0.09764135 0.71824203 -0.45310936 -0.74909501 1.33716399 0.98727639 -0.34894426]
And the pipeline can be used in cross-validation:
In [77]:
# %%
cv_pipe = cross_validate(pipe, X_train, y_train, cv=10, scoring=scoring)
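No output is shown above; one way to summarise it (a sketch; the keys follow the scoring dictionary defined earlier) would be:
# %%
# summarise the pipeline's cross-validation results
for metric in ["test_acc", "test_prec", "test_rec", "test_auc"]:
    print(metric, cv_pipe[metric].mean())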
In [78]:
# %%
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# prepare models
models = []
models.append(("LR", LogisticRegression()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("XGB", xgb.XGBClassifier()))
X = input_bin.drop("category", axis=1)
y = le.fit_transform(input_bin["category"])
n_folds = 5
n_repeats = 5
rskf = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=55)
scoring = "roc_auc"
all_models = pd.DataFrame()
all_models_repetitions = pd.DataFrame()
for name, model in models:
    print(f"building and evaluating for {name}")
    model_results = cross_validate(model, X, y, cv=rskf, scoring=scoring)
    all_models[name] = model_results["test_score"]
    all_models_repetitions[name] = np.mean(
        model_results["test_score"].reshape((n_repeats, n_folds)), axis=1
    )
building and evaluating for LR
building and evaluating for LDA
building and evaluating for XGB
In [79]:
# %%
# compare distributions with histograms
# NOTE: it would be more correct to use all_models_repetitions (one point estimate per
# repetition), but we are not doing so right now because we only use 5 folds and 5 repetitions
sns.histplot(all_models, bins=10, alpha=0.5)
Out[79]:
<Axes: ylabel='Count'>
In [80]:
# %%
# NOTE: it would be more correct to use all_models_repetitions (one point estimate per
# repetition), but we are not doing so right now because we only use 5 folds and 5 repetitions
from scipy.stats import ttest_rel
t_statistic, p_value = ttest_rel(all_models["XGB"], all_models["LDA"])
print(f"{t_statistic=}, {p_value=}")
t_statistic, p_value = ttest_rel(all_models["XGB"], all_models["LR"])
print(f"{t_statistic=}, {p_value=}")
t_statistic, p_value = ttest_rel(all_models["LDA"], all_models["LR"])
print(f"{t_statistic=}, {p_value=}")
t_statistic=67.62612831299954, p_value=6.620171536491319e-29
t_statistic=79.25042298476626, p_value=1.495189585890978e-30
t_statistic=23.663779200561276, p_value=3.8152255647893004e-18
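Following the note above, a sketch of the same comparison done on the repetition means instead (one point estimate per repetition):
# %%
# paired t-tests on one mean estimate per repetition
t_statistic, p_value = ttest_rel(all_models_repetitions["XGB"], all_models_repetitions["LDA"])
print(f"XGB vs LDA: {t_statistic=}, {p_value=}")
t_statistic, p_value = ttest_rel(all_models_repetitions["XGB"], all_models_repetitions["LR"])
print(f"XGB vs LR: {t_statistic=}, {p_value=}")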
In [81]:
# %%
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
# Define the hyperparameter distributions
param_dist = {
"max_depth": stats.randint(3, 10),
"learning_rate": stats.uniform(),
"subsample": stats.uniform(),
}
In [82]:
# %%
scoring = "roc_auc"
# Alternative with multiple metrics, but see below
# scoring = {
# 'acc': 'accuracy',
# 'prec': make_scorer(precision_score, average="weighted"),
# 'rec': make_scorer(recall_score, average="weighted"),
# 'auc': 'roc_auc_ovr'
# }
xgb_auto = xgb.XGBClassifier()
# Create the RandomizedSearchCV object
# NOTE: set `refit=FALSE` if scoring contains multiple metrics
# n_iter determines the number of parameter settings to try (default is 10)
# i.e. 10 total number of combinations that you input in param_distributions
random_search = RandomizedSearchCV(
xgb_auto, param_distributions=param_dist, n_iter=10, cv=10, scoring=scoring
)
X = input_bin.drop("category", axis=1)
y = le.fit_transform(input_bin["category"])
# Fit the RandomizedSearchCV object to the training data
random_search.fit(X, y)
Out[82]:
RandomizedSearchCV(cv=10, estimator=XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rat... num_parallel_tree=None, random_state=None, ...), param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f61f31d1190>, 'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x7f61f31cba90>, 'subsample': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f61f31db430>}, scoring='roc_auc')
In [83]:
# %%
# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)
Best set of hyperparameters:  {'learning_rate': 0.17860733964061282, 'max_depth': 8, 'subsample': 0.6537011094472672}
Best score:  0.9851547726266057
In [84]:
# %%
print(random_search.cv_results_.keys())
dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_learning_rate', 'param_max_depth', 'param_subsample', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'split8_test_score', 'split9_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])
In [85]:
# %%
from sklearn.model_selection import GridSearchCV
# Define the hyperparameter grid
param_grid = {
"max_depth": [3, 5, 7],
"learning_rate": [0.1, 0.01, 0.001],
"subsample": [0.5, 0.7, 1],
}
# Create the GridSearchCV object
grid_search = GridSearchCV(xgb_auto, param_grid, cv=10, scoring=scoring)
grid_search.fit(X, y)
# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)
Best set of hyperparameters:  {'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.7}
Best score:  0.9843496690289877
In [86]:
# %%
X = input.drop("num_sentences", axis=1)
print(input.dtypes)
X["category"] = le.fit_transform(X["category"])
category          object
num_sentences    float64
noun             float64
prop_noun        float64
verb             float64
adj              float64
colon            float64
period           float64
comma            float64
exclamation      float64
dtype: object
In [87]:
# %%
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
# from sklearn.model_selection import RepeatedKFold
y = input["num_sentences"]
scoring = {
"mae": make_scorer(mean_absolute_error),
"rmse": make_scorer(root_mean_squared_error),
"r2": make_scorer(r2_score),
}
xgb_reg = xgb.XGBRegressor(n_estimators=50)
cv_folds = KFold(n_splits=10)
cv_reg = cross_validate(xgb_reg, X, y, scoring=scoring, cv=cv_folds, n_jobs=num_cores)
# alternative if not plotting / not using cross_val_predict
# n_folds = 10
# n_repeats = 25
# rkf = RepeatedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=42)
# cv_reg = cross_validate(xgb_reg, X, y, scoring=scoring, cv=rkf, n_jobs=num_cores)
In [88]:
# %%
# get out predicted results
from sklearn.model_selection import cross_val_predict
# NOTE:
# Even though you partition folds with KFold, RepeatedKFold or similar,
# you run cross-validation twice if you use both cross_validate and cross_val_predict.
# This can be computationally expensive for a complex model (e.g. high N and high P).
# cross_validate has a parameter "return_estimator", but using it would
# require more steps to run the predictions
# (this is just for illustrative purposes anyway)
y_pred = cross_val_predict(xgb_reg, X, y, cv=cv_folds)
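The note above mentions return_estimator; a rough sketch (assuming the same deterministic cv_folds splitter is reused) of getting both the scores and the out-of-fold predictions from a single cross_validate call:
# %%
# sketch: one cross_validate call that also returns the fitted fold estimators
cv_reg_est = cross_validate(
    xgb_reg, X, y, scoring=scoring, cv=cv_folds, n_jobs=num_cores, return_estimator=True
)
y_pred_alt = np.empty(len(y))
for est, (train_idx, test_idx) in zip(cv_reg_est["estimator"], cv_folds.split(X)):
    # KFold without shuffling yields the same splits again, so each estimator
    # can predict its own held-out fold
    y_pred_alt[test_idx] = est.predict(X.iloc[test_idx])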
In [89]:
# %%
# plot
# sns.scatterplot(x=y, y=y_pred)
# includes regression line
sns.regplot(x=y, y=y_pred, scatter_kws={"alpha": 0.1}, line_kws={"color": "grey"})
plt.axis("scaled") # alternative 'square'
plt.xlabel("Actual y")
plt.ylabel("Predicted y")
plt.title("Scatter Plot of Actual vs Predicted")
Out[89]:
Text(0.5, 1.0, 'Scatter Plot of Actual vs Predicted')
In [90]:
# %%
print(f'Mean MAE is {cv_reg["test_mae"].mean()}')
print(f'Mean RMSE is {cv_reg["test_rmse"].mean()}')
print(f'Mean R^2 is {cv_reg["test_r2"].mean()}')
Mean MAE is 0.005218423199237467
Mean RMSE is 0.007486043954101166
Mean R^2 is 0.7829877383464395