{ "cells": [ { "cell_type": "markdown", "id": "07463a5f", "metadata": {}, "source": [ "# ナイーブベイズ分類器\n", "\n", "ナイーブベイズについては以下を参照してください。\n", "[https://scikit-learn.org/stable/modules/naive_bayes.html](https://scikit-learn.org/stable/modules/naive_bayes.html)" ] }, { "cell_type": "markdown", "id": "22ae72e6", "metadata": {}, "source": [ "**データとモジュールのロード**" ] }, { "cell_type": "code", "execution_count": 1, "id": "4f42c570", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn import model_selection\n", "\n", "data = pd.read_csv(\"input/pn_same_judge_preprocessed.csv\")\n", "train, test = model_selection.train_test_split(data, test_size=0.1, random_state=0)" ] }, { "cell_type": "code", "execution_count": 2, "id": "55fdf55a", "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics import ConfusionMatrixDisplay\n", "from sklearn.metrics import PrecisionRecallDisplay" ] }, { "cell_type": "markdown", "id": "310d666d", "metadata": {}, "source": [ "## MultinomialNB" ] }, { "cell_type": "markdown", "id": "ea30df74", "metadata": {}, "source": [ "[sklearn.naive_bayes.MultinomialNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)\n", "を使います。" ] }, { "cell_type": "code", "execution_count": 3, "id": "79cc90a0", "metadata": {}, "outputs": [], "source": [ "from sklearn.naive_bayes import MultinomialNB" ] }, { "cell_type": "code", "execution_count": 4, "id": "bb3dbb21", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('vect',\n",
       "                 TfidfVectorizer(tokenizer=<method 'split' of 'str' objects>)),\n",
       "                ('clf', MultinomialNB())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('vect',\n", " TfidfVectorizer(tokenizer=)),\n", " ('clf', MultinomialNB())])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe_nb = Pipeline([\n", " (\"vect\", TfidfVectorizer(tokenizer=str.split)),\n", " (\"clf\", MultinomialNB())\n", "])\n", "\n", "pipe_nb.fit(train[\"tokens\"], train[\"label_num\"])" ] }, { "cell_type": "code", "execution_count": 5, "id": "441429d6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "pred_nb = pipe_nb.predict(test[\"tokens\"])\n", "ConfusionMatrixDisplay.from_predictions(y_true=test[\"label_num\"], y_pred=pred_nb)" ] }, { "cell_type": "code", "execution_count": 6, "id": "dd63ab0c", "metadata": {}, "outputs": [], "source": [ "score_nb = pipe_nb.predict_proba(test[\"tokens\"])[:,1]" ] }, { "cell_type": "code", "execution_count": 7, "id": "2b58a9c2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "PrecisionRecallDisplay.from_predictions(\n", " y_true=test[\"label_num\"],\n", " y_pred=score_nb,\n", " name=\"Naive Bayes\",\n", ")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" } }, "nbformat": 4, "nbformat_minor": 5 }