import pandas as pd
from sklearn import model_selection
# Load the preprocessed dataset from CSV.
data = pd.read_csv("input/pn_same_judge_preprocessed.csv")

# Hold out 10% of the rows as the test split; the fixed seed keeps the
# split reproducible across runs.
train, test = model_selection.train_test_split(
    data,
    random_state=0,
    test_size=0.1,
)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.linear_model import LogisticRegression
import numpy as np
# Word embeddings are taken from spaCy. First, load the spaCy tokenizer.
import spacy
# Shared spaCy pipeline ("ja_core_news_md"); used by tokenize() and handed to
# DenseVectorizer further down in this file.
nlp = spacy.load("ja_core_news_md")
def tokenize(text):
    """Return the lemma of every token that the module-level ``nlp`` finds in *text*.

    Args:
        text: The string to run through the spaCy pipeline.

    Returns:
        A list with one ``lemma_`` string per token, in token order.
    """
    doc = nlp(text)
    lemmas = [tok.lemma_ for tok in doc]
    return lemmas
# Inheriting from TransformerMixin provides fit_transform, implemented in terms of fit and transform.
from sklearn.base import TransformerMixin
class DenseVectorizer(TransformerMixin):
    """Transformer that maps each input text to a dense vector.

    Every text is passed through the callable given at construction time and
    the ``.vector`` attribute of the result is collected.
    """

    def __init__(self, nlp):
        """Store the pipeline callable used to process each text.

        Args:
            nlp: Callable applied to one text at a time; its result must
                expose a ``vector`` attribute.
        """
        self._nlp = nlp

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn here."""
        return self

    def transform(self, X):
        """Return an array stacking the ``.vector`` of every text in *X*.

        Args:
            X: Iterable of texts.

        Returns:
            ``np.array`` built from one vector per text, in input order.
        """
        docs = map(self._nlp, X)
        return np.array([doc.vector for doc in docs])
# Word-embedding model: DenseVectorizer features fed into a logistic
# regression with class_weight="balanced".
# The original Pipeline([...]) call was never closed; the closing "])" is
# restored here so the statement parses.
pipe_word = Pipeline([
    ("vect", DenseVectorizer(nlp=nlp)),
    ("clf", LogisticRegression(class_weight="balanced")),
])
pipe_word.fit(train["text"], train["label_num"])

# Keep column 1 of predict_proba, i.e. the probability of the second class in
# the classifier's classes_ (presumably the positive label; confirm against
# the label_num encoding).
score_word = pipe_word.predict_proba(test["text"])[:, 1]
from sklearn.base import TransformerMixin
import tensorflow_hub as hub
import numpy as np
import tensorflow_text
class UseVectorizer(TransformerMixin):
    """Transformer that embeds texts with a TensorFlow Hub module.

    The module is loaded once, when the instance is created, from the
    universal-sentence-encoder-multilingual URL below.
    """

    def __init__(self):
        """Load the embedding module from TensorFlow Hub."""
        model_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
        self._embed = hub.load(model_url)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the loaded module is used as-is."""
        return self

    def transform(self, X):
        """Return whatever the loaded module produces when called on *X*.

        Args:
            X: Batch of texts, passed to the module unchanged.
        """
        embeddings = self._embed(X)
        return embeddings
# Sentence-embedding model: UseVectorizer features fed into a logistic
# regression with class_weight="balanced".
# The original Pipeline([...]) call was never closed; the closing "])" is
# restored here so the statement parses.
pipe_sent = Pipeline([
    ("vect", UseVectorizer()),
    ("clf", LogisticRegression(class_weight="balanced")),
])
pipe_sent.fit(train["text"], train["label_num"])

# Keep column 1 of predict_proba, i.e. the probability of the second class in
# the classifier's classes_ (presumably the positive label; confirm against
# the label_num encoding).
score_sent = pipe_sent.predict_proba(test["text"])[:, 1]
