1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
from __future__ import annotations
import logging
import os
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.dummy import DummyClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import KFold
from sec_certs.constants import RANDOM_STATE, REF_ANNOTATION_MODES
from sec_certs.model.references_nlp.feature_extraction import dataframe_to_training_arrays
logger = logging.getLogger(__name__)
def _train_model(
    mode: REF_ANNOTATION_MODES,
    x_train: np.ndarray,
    y_train: np.ndarray,
    x_eval: np.ndarray | None = None,
    y_eval: np.ndarray | None = None,
    learning_rate: float = 0.03,
    depth: int = 6,
    l2_leaf_reg: float = 3,
) -> CatBoostClassifier:
    """Fit a GPU CatBoost classifier on the given training arrays.

    :param mode: annotation mode; "production" uses a fixed iteration budget,
        any other mode trains longer with early stopping on the eval set.
    :param x_train: training feature matrix.
    :param y_train: training labels.
    :param x_eval: optional validation features used for early stopping.
    :param y_eval: optional validation labels (paired with ``x_eval``).
    :param learning_rate: CatBoost learning rate.
    :param depth: tree depth.
    :param l2_leaf_reg: L2 regularization coefficient.
    :return: the fitted classifier.
    :raises KeyError: if ``CUDA_VISIBLE_DEVICES`` is not set in the environment.
    """
    # In production mode, we don't have early stopping on validation set. Hence we use number of iterations that worked during evaluation.
    n_iters = 20 if mode == "production" else 1000
    clf = CatBoostClassifier(
        learning_rate=learning_rate,
        depth=depth,
        l2_leaf_reg=l2_leaf_reg,
        task_type="GPU",
        # Fail fast with a KeyError when no GPU devices are configured.
        devices=os.environ["CUDA_VISIBLE_DEVICES"],
        random_seed=RANDOM_STATE,
        iterations=n_iters,
    )
    train_pool = Pool(x_train, y_train)
    eval_pool = Pool(x_eval, y_eval) if x_eval is not None else None
    # CatBoost rejects use_best_model / early stopping when no eval_set is
    # given (the production path), so only enable them alongside an eval pool.
    # plot=True was dropped: it requires notebook widgets and has no effect in
    # non-interactive runs.
    clf.fit(
        train_pool,
        eval_set=eval_pool,
        verbose=False,
        early_stopping_rounds=100 if eval_pool is not None else None,
        use_best_model=eval_pool is not None,
    )
    return clf
def train_model(
    mode: REF_ANNOTATION_MODES,
    x_train: np.ndarray,
    y_train: np.ndarray,
    x_eval: np.ndarray | None = None,
    y_eval: np.ndarray | None = None,
    train_baseline: bool = False,
    learning_rate: float = 0.079573,
    depth: int = 10,
    l2_leaf_reg: float = 7.303517,
) -> DummyClassifier | CatBoostClassifier:
    """Train either a trivial baseline or a CatBoost reference classifier.

    :param mode: annotation mode forwarded to :func:`_train_model`.
    :param x_train: training feature matrix.
    :param y_train: training labels.
    :param x_eval: optional validation features for early stopping.
    :param y_eval: optional validation labels.
    :param train_baseline: if True, fit sklearn's ``DummyClassifier`` instead
        of CatBoost (useful as a sanity-check lower bound).
    :param learning_rate: CatBoost learning rate (tuned default).
    :param depth: CatBoost tree depth (tuned default).
    :param l2_leaf_reg: CatBoost L2 regularization (tuned default).
    :return: the fitted classifier.
    """
    # Lazy %-formatting so the message is only rendered when INFO is enabled
    # (also fixes the "baselne" typo in the original message).
    logger.info("Training model with baseline=%s", train_baseline)
    if train_baseline:
        clf = DummyClassifier(random_state=RANDOM_STATE)
        clf.fit(x_train, y_train)
    else:
        clf = _train_model(
            mode,
            x_train,
            y_train,
            x_eval,
            y_eval,
            learning_rate,
            depth,
            l2_leaf_reg,
        )
    return clf
def cross_validate_model(
    mode: REF_ANNOTATION_MODES, df: pd.DataFrame, learning_rate: float = 0.03, depth: int = 6, l2_leaf_reg: float = 3
) -> float:
    """Estimate model quality with 5-fold cross-validation.

    :param mode: annotation mode forwarded to :func:`_train_model`.
    :param df: annotated dataframe converted to feature arrays internally.
    :param learning_rate: CatBoost learning rate.
    :param depth: CatBoost tree depth.
    :param l2_leaf_reg: L2 regularization coefficient (float, consistent with
        the sibling training functions; the original annotated it as int).
    :return: mean balanced accuracy across the 5 folds.
    """
    logger.info("Cross-validating model")
    X_train, y_train, _, _, _ = dataframe_to_training_arrays(df, "cross-validation", True, True, True, True)
    kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    scores = []
    for train_index, test_index in kf.split(X_train):
        X_train_, X_test_ = X_train[train_index], X_train[test_index]
        y_train_, y_test_ = y_train[train_index], y_train[test_index]
        # Each fold's test split doubles as the early-stopping eval set.
        clf = _train_model(mode, X_train_, y_train_, X_test_, y_test_, learning_rate, depth, l2_leaf_reg)
        scores.append(balanced_accuracy_score(y_test_, clf.predict(X_test_)))
    # Cast to a plain float so the declared return type holds (np.mean
    # returns a numpy scalar).
    return float(np.mean(scores))
|