
クロスバリデーションについては、scikit-learn の「Cross-validation: evaluating estimator performance」のドキュメントが詳しいのでご一読ください。


import pandas as pd
from sklearn import model_selection

# Load the preprocessed dataset, then hold out 10% of the rows as a test set.
# The fixed random_state keeps the split reproducible across runs.
data = pd.read_csv("input/pn_same_judge_preprocessed.csv")
train, test = model_selection.train_test_split(
    data,
    test_size=0.1,
    random_state=0,
)


sklearn.model_selection.KFold を使うと交差検証用にデータを分割できます。

from sklearn.model_selection import KFold

# Plain shuffled 4-fold split; the label distribution per fold is not controlled.
fold = KFold(n_splits=4, shuffle=True, random_state=0)

# For every fold, report the train/validation shapes and the shape of the
# training rows whose label_num equals 1.
for fold_id, (train_idx, val_idx) in enumerate(fold.split(data["text"])):
    train_cv, val_cv = data.iloc[train_idx], data.iloc[val_idx]
    print(
        fold_id,
        train_cv.shape,
        val_cv.shape,
        train_cv[train_cv["label_num"] == 1].shape,
    )
0 (3139, 3) (1047, 3) (443, 3)
1 (3139, 3) (1047, 3) (454, 3)
2 (3140, 3) (1046, 3) (451, 3)
3 (3140, 3) (1046, 3) (458, 3)

ラベルの分布を保つには sklearn.model_selection.StratifiedKFold を使います。

from sklearn.model_selection import StratifiedKFold

# Stratified 4-fold split: folds are built from the label_num column passed as y.
fold = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)

# For every fold, report the train/validation shapes and the shape of the
# training rows whose label_num equals 1.
for fold_id, (train_idx, val_idx) in enumerate(fold.split(data, data["label_num"])):
    train_cv, val_cv = data.iloc[train_idx], data.iloc[val_idx]
    print(
        fold_id,
        train_cv.shape,
        val_cv.shape,
        train_cv[train_cv["label_num"] == 1].shape,
    )
0 (3139, 3) (1047, 3) (451, 3)
1 (3139, 3) (1047, 3) (451, 3)
2 (3140, 3) (1046, 3) (452, 3)
3 (3140, 3) (1046, 3) (452, 3)


ある指標を最大化するパラメータを探索するには GridSearchCV が便利です。


参考: Stack Overflow

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

# Two-step pipeline: TF-IDF features built by splitting each document on
# whitespace, followed by a classifier step named "clf".
pipe = Pipeline([
    ("vect", TfidfVectorizer(tokenizer=str.split)),
    ("clf", LogisticRegression()),
])

# Parameter grid made of two sub-grids, so class_weight is only varied for
# LogisticRegression: 1 + 2 = 3 candidates in total.
params = [
    {
        "clf": [MultinomialNB()],
    },
    {
        "clf": [LogisticRegression()],
        "clf__class_weight": [None, "balanced"],
    },
]

# Stratified 3-fold grid search over `params`, scored by average precision.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
search = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="average_precision",
    cv=cv,
    verbose=2,
    n_jobs=2,
)
search.fit(train["tokens"], train["label_num"])
Fitting 3 folds for each of 3 candidates, totalling 9 fits
GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=0, shuffle=True),
                                        TfidfVectorizer(tokenizer=<method 'split' of 'str' objects>)),
                                       ('clf', LogisticRegression())]),
             param_grid=[{'clf': [MultinomialNB()]},
                         {'clf': [LogisticRegression(class_weight='balanced')],
                          'clf__class_weight': [None, 'balanced']}],
             scoring='average_precision', verbose=2)
0 1 2
mean_fit_time 0.036518 0.064396 0.064668
std_fit_time 0.002994 0.003345 0.009993
mean_score_time 0.016316 0.014224 0.015696
std_score_time 0.002359 0.000241 0.000348
param_clf MultinomialNB() LogisticRegression(class_weight='balanced') LogisticRegression(class_weight='balanced')
param_clf__class_weight NaN None balanced
params {'clf': MultinomialNB()} {'clf': LogisticRegression(class_weight='balan... {'clf': LogisticRegression(class_weight='balan...
split0_test_score 0.731963 0.820622 0.825565
split1_test_score 0.72699 0.782203 0.789138
split2_test_score 0.756417 0.846741 0.849069
mean_test_score 0.738456 0.816522 0.821257
std_test_score 0.012861 0.026507 0.024655
rank_test_score 3 2 1
{'clf': LogisticRegression(class_weight='balanced'),
 'clf__class_weight': 'balanced'}


# Apply the best hyperparameters found by the grid search before refitting on
# the whole training split; without this the pipeline keeps its default "clf".
pipe.set_params(**search.best_params_)
pipe.fit(X=train["tokens"], y=train["label_num"])
                 TfidfVectorizer(tokenizer=<method 'split' of 'str' objects>)),
                ('clf', LogisticRegression(class_weight='balanced'))])
# Predict on the same column the pipeline was trained on ("tokens"); the
# vectorizer splits on whitespace, so the input column must match training.
proba = pipe.predict_proba(X=test["tokens"])


ある指標を最大化するだけでなく、例えばすべてのPRカーブを描くなどの操作が必要な場合には ハイパーパラメータの組み合わせに対して自分で学習、推論のコードを実装する必要があります。

ハイパーパラメータの組み合わせは ParameterGrid を使うことができます。


from sklearn.model_selection import ParameterGrid

# Two independent sub-grids; ParameterGrid expands each one into the Cartesian
# product of its values (2 x 2 + 2 x 2 = 8 combinations).
params_example = [
    {"a": ["a1", "a2"], "b": ["b1", "b2"]},
    {"a": ["a3", "a4"], "b": ["b3", "b4"]},
]
list(ParameterGrid(params_example))
[{'a': 'a1', 'b': 'b1'},
 {'a': 'a1', 'b': 'b2'},
 {'a': 'a2', 'b': 'b1'},
 {'a': 'a2', 'b': 'b2'},
 {'a': 'a3', 'b': 'b3'},
 {'a': 'a3', 'b': 'b4'},
 {'a': 'a4', 'b': 'b3'},
 {'a': 'a4', 'b': 'b4'}]

ParameterGridとset_paramsを使うことで次のように自分で 各パラメータの組み合わせに対してクロスバリデーションを実行することが可能になります。


  • 各イテレーションでは sklearn.base.clone でパイプラインのクローンを作成することで新しい分類器を作成して、前の結果に依存しないようにします。

  • パラメータ内に推論器が入っている場合があるので、 sklearn.utils.estimator_checks.check_estimator で推論器かチェックし、そうであればcloneします。

from sklearn.model_selection import ParameterGrid
from sklearn.base import clone
from sklearn.utils.estimator_checks import check_estimator
import numpy as np

def _clone_param_values(param):
    """Return a copy of *param* in which every estimator value is a fresh clone."""
    cloned = {}
    for key, val in param.items():
        try:
            # clone() raises TypeError for objects that are not estimators,
            # so it doubles as the "is this an estimator?" test.
            fresh = clone(val)
        except (TypeError, ValueError):
            # Not an estimator: use the value as it is, without cloning.
            cloned[key] = val
        else:
            print(f"Clone estimator in parameter {key}: {val}")
            cloned[key] = fresh
    return cloned


def run_cv(pipe, params, cv, X, y):
    """Run cross-validation by hand for every combination in a parameter grid.

    For each parameter combination the pipeline is cloned per fold, the
    combination is applied with ``set_params``, the clone is fitted on the
    training part of the fold and the positive-class probability is stored
    for the held-out part.

    Args:
        pipe: Pipeline (or estimator) to clone; the clone must provide
            ``set_params``, ``fit`` and ``predict_proba``.
        params: Parameter grid accepted by ``ParameterGrid``.
        cv: Splitter providing ``split(X=..., y=...)``.
        X: Features; indexed with ``.iloc``.
        y: Targets; indexed with ``.iloc``.

    Returns:
        A list with one ``(param, pred)`` tuple per parameter combination.
        ``param`` is the combination as applied in the last fold and ``pred``
        is an array of length ``len(X)`` holding the out-of-fold probability
        taken from column 1 of ``predict_proba``.
    """
    result = []
    for param in ParameterGrid(params):
        print(param)
        pred = np.zeros((len(X), ))
        param_clone = {}
        for fold_id, (train_idx, test_idx) in enumerate(cv.split(X=X, y=y)):
            print("Fold:", fold_id)
            # Fresh clones per fold so no fitted state leaks between folds.
            pipe_clone = clone(pipe)
            param_clone = _clone_param_values(param)
            # Without this call every combination would train the same pipeline.
            pipe_clone.set_params(**param_clone)
            pipe_clone.fit(X=X.iloc[train_idx], y=y.iloc[train_idx])
            # The estimator is assumed to provide predict_proba; this does not
            # work for estimators without it (for example SVC by default).
            pred[test_idx] = pipe_clone.predict_proba(X.iloc[test_idx])[:, 1]

        result.append((param_clone, pred))
    return result
# Same stratified 3-fold splitter as the grid search, now driving the manual loop.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
result = run_cv(
    pipe=pipe,
    params=params,
    cv=cv,
    X=train["tokens"],
    y=train["label_num"],
)
{'clf': MultinomialNB()}
Fold: 0
Clone estimator in parameter clf: MultinomialNB()
Fold: 1
Clone estimator in parameter clf: MultinomialNB()
Fold: 2
Clone estimator in parameter clf: MultinomialNB()
{'clf': LogisticRegression(class_weight='balanced'), 'clf__class_weight': None}
Fold: 0
Clone estimator in parameter clf: LogisticRegression(class_weight='balanced')
Fold: 1
Clone estimator in parameter clf: LogisticRegression(class_weight='balanced')
Fold: 2
Clone estimator in parameter clf: LogisticRegression(class_weight='balanced')
{'clf': LogisticRegression(class_weight='balanced'), 'clf__class_weight': 'balanced'}
Fold: 0
Clone estimator in parameter clf: LogisticRegression(class_weight='balanced')
Fold: 1
Clone estimator in parameter clf: LogisticRegression(class_weight='balanced')
Fold: 2
Clone estimator in parameter clf: LogisticRegression(class_weight='balanced')
# Show the parameter combination stored with each set of out-of-fold predictions.
for res in result:
    param, pred = res
    print(param)
{'clf': MultinomialNB()}
{'clf': LogisticRegression(), 'clf__class_weight': None}
{'clf': LogisticRegression(class_weight='balanced'), 'clf__class_weight': 'balanced'}