From 6bac0a7e57801e15696a2ccd24f71f7cbebbaf29 Mon Sep 17 00:00:00 2001 From: NickVernal Date: Sun, 17 Dec 2017 21:51:01 +0500 Subject: [PATCH] sorry for shitcode --- dz3/NikitaNuykin/task3.ipynb | 641 +++++++++++++++++++++++++++++++++++ 1 file changed, 641 insertions(+) create mode 100644 dz3/NikitaNuykin/task3.ipynb diff --git a/dz3/NikitaNuykin/task3.ipynb b/dz3/NikitaNuykin/task3.ipynb new file mode 100644 index 0000000..faf09cd --- /dev/null +++ b/dz3/NikitaNuykin/task3.ipynb @@ -0,0 +1,641 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import pymorphy2\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", + "import pymystem3\n", + "from sklearn.cross_validation import train_test_split\n", + "from sklearn.metrics import classification_report, roc_auc_score\n", + "from sklearn.naive_bayes import BernoulliNB, MultinomialNB\n", + "from stop_words import get_stop_words\n", + "from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, chi2, SelectFromModel" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "morph = pymorphy2.MorphAnalyzer()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "def string_to_trigramm(string):\n", + " for item in string.split():\n", + " for i in range(len(item)-3):\n", + " yield item[i:i+4]" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [], + "source": [ + "def get_data(file, answer=True):\n", + " X = []\n", + " Y = []\n", + " cache = {}\n", + " \n", + " pattern = re.compile('([^\\s\\w]|_)+')\n", + "\n", + " with open(file) as f:\n", + " content = f.readlines()[1:]\n", + "\n", + " for line in content:\n", + " line_data = line.split(',')\n", + " temp = line_data[1].lower()\n", + " temp1 = temp\n", + " \n", + " temp = re.sub(r'[^а-я ]+', '', temp)\n", + " temp = ' '.join([x for x in temp.split(' ')[1:-1] if len(x) > 2 and len(x) < 15])\n", + " temp = ' '.join([morph.parse(x)[0].normal_form for x in temp.split(' ') if x])\n", + " for x in get_stop_words('ru'):\n", + " temp = re.sub(r\"\\b%s\\b\" % x , '', temp)\n", + " temp += ' '.join([x for x in string_to_trigramm(temp)])\n", + "\n", + " X.append(temp1 + temp)\n", + " if answer:\n", + " Y.append(int(line_data[2][:-1]))\n", + " return X, Y" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [], + "source": [ + "def write_data(f_name, result):\n", + " with open(f_name,'w') as file:\n", + " file.write('id,prob\\n')\n", + " id = 0\n", + "\n", + " for i in range(len(result)):\n", + " line = '{0},{1}\\n'.format(id,result[i])\n", + " id+=1\n", + " file.write(line)" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [], + "source": [ + "X, Y = get_data('data3/train.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 207, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ся оправдать иру? или… позвонил ей на работу сказал поедет к нотариусу надо довнести картошку он почистил. ира повозмущалась потом сдалась. все равно тебя вылечу только и произнесла негромко. он повесил трубку быстро собрался. по дороге перебирая в кармане купюры вдруг вспомнил — их с пумкой фотографировал егор как раз весной в начале мая. было холодно. у него должны сохраниться фотографии он воправдать ир позвонить работа поехать нотариус довнести картошка почистить ир повозмущаться сдаться равно вылечить произнести негромко повесить трубка быстро собраться дорога перебирать карман купюра вспомнить пумка фотографировать егор весной начало май холодный должный сохраниться фотографияопра прав равд авда вдат дать позв озво звон вони онит нить рабо абот бота поех оеха ехат хать нота отар тари ариу риус довн овне внес нест ести карт арто ртош тошк ошка почи очис чист исти стит тить пово овоз возм озму змущ муща ущат щать атьс ться сдат дать атьс ться равн авно выле ылеч лечи ечит чить прои роиз оизн изне знес нест ести негр егро гром ромк омко пове овес веси есит сить труб рубк убка быст ыстр стро собр обра брат рать атьс ться доро орог рога пере ереб реби ебир бира ират рать карм арма рман купю упюр пюра вспо спом помн омни мнит нить пумк умка фото отог тогр огра граф рафи афир фиро иров рова оват вать егор весн есно сной нача ачал чало холо олод лодн одны дный долж олжн лжны жный сохр охра хран рани анит нить итьс ться фото отог тогр огра граф рафи афия'" + ] + }, + "execution_count": 207, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X[2]" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "metadata": {}, + "outputs": [], + "source": [ + "tfidf_vectorizer = TfidfVectorizer(min_df=3, max_df=0.7)\n", + "X_tfidf = tfidf_vectorizer.fit_transform(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(9760, 64486)" + ] + }, + "execution_count": 200, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train, X_test, Y_train, Y_test = train_test_split(X_tfidf, Y, train_size=0.7)\n", + "X_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(9760, 5000)" + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sel = SelectKBest(chi2, k=5000)\n", + "sel.fit(X_train, Y_train)\n", + "X_train = sel.transform(X_train).toarray()\n", + "X_test = sel.transform(X_test).toarray()\n", + "X_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "metadata": {}, + "outputs": [], + "source": [ + "len(sel.scores_)\n", + "10scores = sel.scores_\n", + "scores.sort()" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA2oAAAEyCAYAAACLaSO4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAGIZJREFUeJzt3XusZVd9H/Dvjxm/efh1edmYMQVc\nuYiCewFTh0dwAAeQnUhEMiqNk9CMQiElbSIEtUJSKX/QFKWBEAVNjGMgLgQcAxYiBRdILRJjGGMD\nfoGNITDUZq4x4eW3Z/WPsy++vsx4Zu45d86653w+0tbee+199vn5LmvP/d69zjrVWgsAAAD9eNi0\nCwAAAODBBDUAAIDOCGoAAACdEdQAAAA6I6gBAAB0RlADAADojKAGAADQGUENAACgM4IaAABAZzYf\nyDc79thj25YtWw7kWwIAAHTjyiuvvK21trC38w5oUNuyZUu2b99+IN8SAACgG1X1T/tynqGPAAAA\nnRHUAAAAOiOoAQAAdGavQa2qzq+qnVV1zar2366qG6rq2qr64/UrEQAAYL7syxO1C5KcsbKhqn4+\nyVlJ/nVr7V8ledvkSwMAAJhPew1qrbXLkty+qvm1Sd7aWrt7OGfnOtQGAAAwl9b6GbWnJnleVV1R\nVf+3qp41yaIAAADm2Vq/R21zkqOTnJrkWUk+WFVPaq211SdW1dYkW5PkhBNOWGudAAAAc2OtT9R2\nJLm4jXw+ya4kx+7uxNbattbaYmttcWFhr1/ADQAAMPfWGtQ+kuTnk6Sqnprk4CS3TaooAACANfvg\nB5NPfWraVYxlX6bnf3+Sy5OcVFU7quo1Sc5P8qRhyv4PJDlnd8MeAQAADrjf//3kvPOmXcVY9voZ\ntdbaq/Zw6NUTrgUAAICsfegjAAAA60RQAwAA6IygBgAA0BlBDQAAoDOCGgAAQGcENQAAgM4IagAA\nAJ0R1AAAADojqAEAAHRGUAMAAOiMoAYAANAZQQ0AAKAzghoAAEBnBDUAAIDOCGoAAACdEdQAAAA6\nI6gBAAB0RlADAABmS2vTrmBsghoAADB7qqZdwVgENQAAgM4IagAAAJ0R1AAAADojqAEAAHRmr0Gt\nqs6vqp1Vdc1ujv1uVbWqOnZ9ygMAAJg/+/JE7YIkZ6xurKonJHlJkm9NuCYAAIC5tteg1lq7LMnt\nuzn0P5O8McnG/5ICAACAjqzpM2pVdVaS77TWvrQP526tqu1VtX1paWktbwcAADBX9juoVdXhSf5r\nkrfsy/mttW2ttcXW2uLCwsL+vh0AAMDcWcsTtX+R5MQkX6qqbyY5PskXq+qxkywMAABgXm3e3xe0\n1r6S5NHL+0NYW2yt3TbBugAAAObWvkzP//4klyc5qap2VNVr1r8sAACA+bXXJ2qttVft5fiWiVUD\nAADA2mZ9BAAAYP0IagAAAJ0R1AAAADojqAEAAHRGUAMAAOiMoAYAAMyW1qZdwdgENQAAYPZUTbuC\nsQhqAAAAnRHUAAAAOiOoAQAAdEZQAwAA6IygBgAA0BlBDQAAoDOCGgAAQGcENQAAgM4IagAAAJ0R\n1AAAADojqAEAAHRGUAMAAOiMoAYAANAZQQ0AAKAzghoAAEBn9hrUqur8qtpZVdesaPsfVXVDVX25\nqj5cVUeub5kAAADzY1+eqF2Q5IxVbZcmeVpr7elJvpbkzROuCwAAYG7tNai11i5Lcvuqtk+21u4b\ndj+X5Ph1qA0AAGAuTeIzar+R5O/2dLCqtlbV9qravrS0NIG3AwAAeAitTbuCsY0V1Krq3CT3Jblw\nT+e01ra11hZba4sLCwvjvB0AAMC+qZp2BWPZvNYXVtWvJXlFktNbm4HICgAA0Ik1BbWqOiPJG5O8\noLV2x2RLAgAAmG/7Mj3/+5NcnuSkqtpRVa9J8s4kj0hyaVVdXVXvWuc6AQAA5sZen6i11l61m+Z3\nr0MtAAAAZDKzPgIAADBBghoAAEBnBDUAAIDOCGoAAACdEdQAAAA6I6gBAAB0RlADAADojKAGAADQ\nGUENAACgM4IaAABAZwQ1AACAzghqAAAAnRHUAAAAOiOoAQAAdEZQAwAAZktr065gbIIaAAAwe6qm\nXcFYBDUAAIDOCGoAAACdEdQAAAA6I6gBAAB0RlADAADojKAGAADQmb0Gtao6v6p2VtU1K9qOrqpL\nq+rGYX3U+pYJAAAwP/blidoFSc5Y1famJJ9qrT0lyaeGfQAAACZgr0GttXZZkttXNZ+V5D3D9nuS\n/NKE6wIAAJhba/2M2mNaa7cM27cmecyE6gEAAJh7Y08m0lprSdqejlfV1qraXlXbl5aWxn07AACA\nmbfWoPbdqnpckgzrnXs6sbW2rbW22FpbXFhYWOPbAQAAzI+1BrVLkpwzbJ+T5KOTKQcAAIB9mZ7/\n/UkuT3JSVe2oqtckeWuSF1fVjUl+YdgHAABgAjbv7YTW2qv2cOj0CdcCAABAJjCZCAAAAJMlqAEA\nAHRGUAMAAOiMoAYAAMyWtseved4wBDUAAGD2VE27grEIagAAAJ0R1AAAADojqAEAAHRGUAMAAOiM\noAYAANAZQQ0AAKAzghoAAEBnBDUAAIDOCGoAAACdEdQAAAA6I6gBAAB0RlADAADojKAGAADQGUEN\nAACgM4IaAABAZwQ1AACAzghqAAAAnRHUAAAAOjNWUKuq/1xV11bVNVX1/qo6dFKFAQAArElr065g\nbGsOalV1XJL/lGSxtfa0JJuSnD2pwgAAANasatoVjGXcoY+bkxxWVZuTHJ7k/41fEgAAwHxbc1Br\nrX0nyduSfCvJLUl+0Fr75OrzqmprVW2vqu1LS0trrxQAAGBOjDP08agkZyU5McnjkxxRVa9efV5r\nbVtrbbG1triwsLD2SgEAAObEOEMffyHJN1prS621e5NcnOTfTqYsAACA+TVOUPtWklOr6vCqqiSn\nJ7l+MmUBAADMr3E+o3ZFkouSfDHJV4ZrbZtQXQAAAHNr8zgvbq39QZI/mFAtAAAAZPzp+QEAAJgw\nQQ0AAKAzghoAAEBnBDUAAIDOCGoAAACdEdQAAAA6I6gBAAB0RlADAADojKAGAADQGUENAACgM4Ia\nAABAZwQ1AABgtrQ27QrGJqgBAACzpbWkatpVjEVQAwAAZktrycM2dtTZ2NUDAACstmuXJ2oAAABd\n8UQNAACgM56oAQAAdMZkIgAAAJ0x9BEAAKAzhj4CAAB0xtBHAACAzsz70MeqOrKqLqqqG6rq+qp6\n7qQKAwAAWJMZGPq4eczXvz3J/26tvbKqDk5y+ARqAgAAWLsZeKK25qBWVY9K8vwkv5YkrbV7ktwz\nmbIAAADWaAaeqI0TM09MspTkr6rqqqo6r6qOWH1SVW2tqu1VtX1paWmMtwMAANgHcz6ZyOYkpyT5\ni9baM5P8JMmbVp/UWtvWWltsrS0uLCyM8XYAAAD7YAaGPo5T/Y4kO1prVwz7F2UU3AAAAKZnnoc+\nttZuTfLtqjppaDo9yXUTqQoAAGCtZmDo47izPv52kguHGR9vTvLr45cEAAAwhhkY+jhWUGutXZ1k\ncUK1AAAAjG+ehz4CAAB0aQaeqG3s6gEAAFbzRA0AAKAzMzCZiKAGAADMFkMfAQAAOtLaaO2JGgAA\nQCcENQAAgM4sBzVDHwEAADqxa9do7YkaAABAJwx9BAAA6MzyEzVDHwEAADrhiRoAAEBnTCYCAADQ\nGZOJAAAAdMbQRwAAgM4Y+ggAANAZQx8BAAA6Y+gjAABAZ3yPGgAAQGc8UQMAAOiMyUQAAAA6YzIR\nAACAzhj6OFJVm6rqqqr62CQKAgAAWDNDH3/qDUmun8B1AAAAxmPoY1JVxyd5eZLzJlMOAADAGAx9\nTJL8aZI3Jtk1gVoAAADGM+9DH6vqFUl2ttau3Mt5W6tqe1VtX1paWuvbAQAA7J2hjzktyZlV9c0k\nH0jyoqr669Untda2tdYWW2uLCwsLY7wdAADAXsz7E7XW2ptba8e31rYkOTvJp1trr55YZQAAAPvL\nEzUAAIDOzMhkIpsncZHW2t8n+ftJXAsAAGDN5n3oIwAAQHcMfQQAAOjMjAx9FNQAAIDZYegjAABA\nZwx9BAAA6IwnagAAAJ3xRA0AAKAzJhMBAADojKGPAAAAnTH0EQAAoDOGPgIAAHTG0EcAAIDO3H//\naC2oAQAAdOLuu0frQw+dbh1jEtQAAIDZcdddo7WgBgAA0AlBDQAAoDOCGgAAQGcENQAAgM4sB7VD\nDpluHWMS1AAAgNlx552j9eGHT7eOMQlqAADA7LjjjtFaUAMAAOjEHXckVYY+AgAAdOOHP0yOOGIU\n1jYwQQ0AAJgdS0vJox897SrGtuagVlVPqKrPVNV1VXVtVb1hkoUBAADst5075zuoJbkvye+21k5O\ncmqS11XVyZMpCwAAYA3mPai11m5prX1x2P5RkuuTHDepwgAAAPbbvAe1lapqS5JnJrliN8e2VtX2\nqtq+tLQ0ibcDAAD4WXfdNfqM2mMfO+1KxjZ2UKuqhyf52yS/01r74erjrbVtrbXF1triwsLCuG8H\nAACwe9u3J/ffnywuTruSsY0V1KrqoIxC2oWttYsnUxIAAMAafPSjyaZNyWmnTbuSsY0z62MleXeS\n61trfzK5kgAAAPbTrl3J+96XvOIVybHHTruasY3zRO20JP8+yYuq6uphedmE6gIAANh3N9yQfPe7\nyZlnTruSidi81he21j6bZGN/3TcAADAbrr56tH72s6dbx4RMZNZHAACAqbr66uTgg5OTTpp2JRMh\nqAEAABvfF7+YPO1pyUEHTbuSiRDUAACAjW3XruTzn0+e85xpVzIxghoAALCx3Xhj8qMfJc961rQr\nmRhBDQAA2NguuWS0ft7zplvHBAlqAADAxnX//ckFF4xme3zyk6ddzcSseXp+AACAqfv4x5Prrkve\n+95pVzJRnqgBAAAb03XXJb/5m8lTn5q88pXTrmaiBDUAAGBjaS35yEeS005L7rsv+fCHk8MOm3ZV\nEyWoAQAAG0NryWWXJS98YfLLv5xs2ZL84z8mJ5887comTlADAAD6d9llo1kdX/CC5GtfS975zuTy\ny0fDHmeQoAYAAPRr+/bkpS8dBbRvfCP5sz9Lbroped3rkkMPnXZ168asjwAAQH+uuy75wz9MPvSh\n5Oijk7e9LXnta5PDD592ZQeEoAYAAPThRz8aTRLyvvcll146CmVveUvye7+XPOIR067ugBLUAACA\n6br55uQd70guvDC57bbkhBOSP/qjZOvWZGFh2tVNhaAGAAAcePffn3zyk8nb3z5ab96cnHlm8vrX\nJ89/fvKw+Z5OQ1ADAAAOjJ/8JPmHf0g++tHk4ouTW29NHv/45Nxzk9/6reS446ZdYTcENQAAYLLu\nums0hPHrX0+uumq0XHllcsMNoydphx2WvPzlya/8SnLWWckhh0y74u4IagAAwO61ltxxR/K9742C\n176u77jjwdd57GOTU04ZfUn1aaeNhjbOyeyNayWoAQDAPGgt+fGP9y9wfe97o6dje3LUUckxxyTH\nHjsawvj0pz+wf8wxo0lBnvGMUVBjvwhqAACwkbSW3HPP6PNe3//+nkPW7truvXf316wafVfZcsB6\n4hNHT8CW93e3Puqo0QQgrAs/WQAA2BetjT5fdc89o8CzvKzcX719992jZXn7zjsfernjjr2fc+ed\nya5de65z06ZRmFoOVE9+cnLqqXsOXMcckxx55Oh1dGOsoFZVZyR5e5JNSc5rrb11IlUBALBxLAeY\nvS27dj14/777Hliv3l69f++9D16vblu57K5tOTgtB6nl7T0tu3v9np5GjeOgg0YTaxx22OgzW8vb\nhx02+oLnRz9698cOOyw54ogHDz1cXj/ykXM/tf0sWHNQq6pNSf48yYuT7Ejyhaq6pLV23aSKA4Au\ntfaz2/u73tOxXbtG6z1tr+X4LF+zlzo2yjUnXcdy+Fr5//W0VY3Cz8pl8+bk4IMfWA466MHbD3/4\nzx5/qGX1OSv3V24fcshof+V6ddjyFIs9GOeJ2rOT3NRauzlJquoDSc5KsrGC2mc/Oxqzmzz0P6Lr\nsT1L196f9Vpe47X7t97T9lr+X5nUa3qpYxI/02lea9rv38s1pvX+bFwPe9joF/iqB7b3tW3c4+Nc\nc9OmvuvctOmBGpe3H2pZfd7mzQ8sK/dXby+HreX17raXF8GHGTFOUDsuybdX7O9I8pzVJ1XV1iRb\nk+SEE04Y4+3WybnnJpddNu0qOFCq1r7eaK/d0/a+tq3Ha3qpY19/7uNe46HW03ptL9eY9vv3co3d\ntU36F+lefqGf5jUBNqB1n0yktbYtybYkWVxc7O9Pkn/5l6MZc5Y91D+i67E9S9fen/U4vxiN8wsR\nAABsAOMEte8kecKK/eOHto3lqU+ddgUAAAAPMs50MF9I8pSqOrGqDk5ydpJLJlMWAADA/FrzE7XW\n2n1V9fokn8hoev7zW2vXTqwyAACAOTXWZ9Raax9P8vEJ1QIAAEDGG/oIAADAOhDUAAAAOiOoAQAA\ndEZQAwAA6IygBgAA0BlBDQAAoDOCGgAAQGeqtXbg3qxqKck/HbA33HfHJrlt2kWwLvTt7NK3s0m/\nzi59O7v07ezSt+vjia21hb2ddECDWq+qantrbXHadTB5+nZ26dvZpF9nl76dXfp2dunb6TL0EQAA\noDOCGgAAQGcEtZFt0y6AdaNvZ5e+nU36dXbp29mlb2eXvp0in1EDAADojCdqAAAAnRHUAAAAOjPX\nQa2qzqiqr1bVTVX1pmnXw+5V1flVtbOqrlnRdnRVXVpVNw7ro4b2qqp3DH365ao6ZcVrzhnOv7Gq\nzlnR/m+q6ivDa95RVXVg/wvnV1U9oao+U1XXVdW1VfWGoV3/bnBVdWhVfb6qvjT07X8b2k+sqiuG\n/vibqjp4aD9k2L9pOL5lxbXePLR/tapeuqLdPXxKqmpTVV1VVR8b9vXrDKiqbw73y6uravvQ5n48\nA6rqyKq6qKpuqKrrq+q5+nYDaK3N5ZJkU5KvJ3lSkoOTfCnJydOuy7Lbvnp+klOSXLOi7Y+TvGnY\nflOS/z5svyzJ3yWpJKcmuWJoPzrJzcP6qGH7qOHY54dza3jtL077v3leliSPS3LKsP2IJF9LcrL+\n3fjL8PN++LB9UJIrhn74YJKzh/Z3JXntsP0fk7xr2D47yd8M2ycP9+dDkpw43Lc3uYdPvX//S5L/\nleRjw75+nYElyTeTHLuqzf14BpYk70nyH4btg5McqW/7X+b5idqzk9zUWru5tXZPkg8kOWvKNbEb\nrbXLkty+qvmsjG46Gda/tKL9vW3kc0mOrKrHJXlpkktba7e31r6f5NIkZwzHHtla+1wb3Wneu+Ja\nrLPW2i2ttS8O2z9Kcn2S46J/N7yhj3487B40LC3Ji5JcNLSv7tvlPr8oyenDX2TPSvKB1trdrbVv\nJLkpo/u3e/iUVNXxSV6e5Lxhv6JfZ5n78QZXVY/K6I/e706S1to9rbV/jr7t3jwHteOSfHvF/o6h\njY3hMa21W4btW5M8ZtjeU78+VPuO3bRzgA1Dop6Z0ZMX/TsDhuFxVyfZmdE/6F9P8s+ttfuGU1b2\nx0/7cDj+gyTHZP/7nPX3p0nemGTXsH9M9OusaEk+WVVXVtXWoc39eOM7MclSkr8ahiyfV1VHRN92\nb56DGjNi+OuN75nYwKrq4Un+NsnvtNZ+uPKY/t24Wmv3t9aekeT4jJ6U/Mspl8SYquoVSXa21q6c\ndi2si59rrZ2S5BeTvK6qnr/yoPvxhrU5o4+Q/EVr7ZlJfpLRUMef0rd9mueg9p0kT1ixf/zQxsbw\n3eFRe4b1zqF9T/36UO3H76adA6SqDsoopF3YWrt4aNa/M2QYYvOZJM/NaAjN5uHQyv74aR8Oxx+V\n5HvZ/z5nfZ2W5Myq+mZGwxJflOTt0a8zobX2nWG9M8mHM/oDi/vxxrcjyY7W2hXD/kUZBTd927l5\nDmpfSPKUYaaqgzP6kPMlU66JfXdJkuXZhs5J8tEV7b86zFh0apIfDI/1P5HkJVV11DCr0UuSfGI4\n9sOqOnX43MSvrrgW62z4mb87yfWttT9ZcUj/bnBVtVBVRw7bhyV5cUafQfxMklcOp63u2+U+f2WS\nTw9/4b0kydk1mj3wxCRPyehD6+7hU9Bae3Nr7fjW2paMfuafbq39u+jXDa+qjqiqRyxvZ3QfvSbu\nxxtea+3WJN+uqpOGptOTXBd927/1mqVkIywZzWrztYw+N3HutOux7LGf3p/kliT3ZvRXoddk9BmH\nTyW5Mcn/SXL0cG4l+fOhT7+SZHHFdX4jow+s35Tk11e0L2b0j9HXk7wzSU37v3leliQ/l9FQiy8n\nuXpYXqZ/N/6S5OlJrhr69pokbxnan5TRL+Q3JflQkkOG9kOH/ZuG409aca1zh/77albMJOYePvU+\nfmEemPVRv27wZejDLw3Ltcs/e/fj2ViSPCPJ9uGe/JGMZm3Ut50vNfxwAQAA6MQ8D30EAADokqAG\nAADQGUENAACgM4IaAABAZwQ1AACAzghqAAAAnRHUAAAAOvP/AfDvaiFXEaI2AAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(15,5))\n", + "plt.plot([x for x in range(len(sel.scores_))], sel.scores_, c='red')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 206, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "roc_auc 0.434420069475\n", + "roc_auc 0.964928716167\n", + " precision recall f1-score support\n", + "\n", + " 0 0.90 0.81 0.85 3794\n", + " 1 0.06 0.12 0.08 390\n", + "\n", + "avg / total 0.82 0.74 0.78 4184\n", + "\n" + ] + } + ], + "source": [ + "clf_bnb = BernoulliNB(alpha=0.0001)\n", + "clf_bnb.fit(X_train, Y_train)\n", + "Y_pred = clf_bnb.predict(X_test)\n", + "Y_pred_proba = clf_bnb.predict_proba(X_test)\n", + "Y_pred_prob = clf_bnb.predict_proba(X_train)\n", + "print('roc_auc', roc_auc_score(y_true=Y_test, y_score=Y_pred_proba[:,1]))\n", + "print('roc_auc', roc_auc_score(y_true=Y_train, y_score=Y_pred_prob[:,1]))\n", + "print(classification_report(y_true=Y_test, y_pred=Y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "roc_auc 0.475252471397\n", + "roc_auc 0.958485046357\n", + " precision recall f1-score support\n", + "\n", + " 0 0.90 1.00 0.95 3760\n", + " 1 0.00 0.00 0.00 424\n", + "\n", + "avg / total 0.81 0.90 0.85 4184\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/nikita/work/ml/py3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n" + ] + } + ], + "source": [ + "clf_mnb = MultinomialNB(alpha=1)\n", + "clf_mnb.fit(X_train, Y_train)\n", + "Y_pred = clf_mnb.predict(X_test)\n", + "Y_pred_proba = clf_mnb.predict_proba(X_test)\n", + "Y_pred_prob = clf_bnb.predict_proba(X_train)\n", + "print('roc_auc', roc_auc_score(y_true=Y_test, y_score=Y_pred_proba[:,1]))\n", + "print('roc_auc', roc_auc_score(y_true=Y_train, y_score=Y_pred_prob[:,1]))\n", + "print(classification_report(y_true=Y_test, y_pred=Y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "варианта -3.57277769662\n", + "богиня -3.57277769662\n", + "бумажка -3.57277769662\n", + "бвор -3.57277769662\n", + "адекватный -3.57277769662\n", + "влия -3.57277769662\n", + "бвен -3.57277769662\n", + "база -3.57277769662\n", + "абсурд -3.57277769662\n", + "бухгалтерии -3.57277769662\n", + "блестеть -3.57277769662\n", + "азён -3.57277769662\n", + "вожд -3.57277769662\n", + "бинты -3.53503736864\n", + "возьмет -3.53503736864\n", + "вассал -3.53503736864\n", + "бинн -3.53503736864\n", + "вона -3.53503736864\n", + "вернется -3.53503736864\n", + "анив -3.53503736864\n", + "апри -3.53503736864\n", + "ачас -3.53503736864\n", + "взяло -3.53503736864\n", + "взрезать -3.53503736864\n", + "абсу -3.53503736864\n", + "браться -3.53503736864\n", + "благополучно -3.53503736864\n", + "вира -3.49866972447\n", + "бродил -3.49866972447\n", + "агия -3.49866972447\n", + "варька -3.49866972447\n", + "вздохи -3.49866972447\n", + "взмыла -3.46357840466\n", + "анальгин -3.46357840466\n", + "бархатный -3.46357840466\n", + "ажем -3.46357840466\n", + "бумажные -3.46357840466\n", + "вовка -3.46357840466\n", + "влиятельный -3.46357840466\n", + "60 -3.46357840466\n", + "васька -3.46357840466\n", + "бешеной -3.46357840466\n", + "атра -3.46357840466\n", + "бесполезный -3.46357840466\n", + "банку -3.46357840466\n", + "видное -3.46357840466\n", + "больший -3.46357840466\n", + "анит -3.46357840466\n", + "академия -3.42967685298\n", + "асно -3.42967685298\n", + "барт -3.42967685298\n", + "актр -3.42967685298\n", + "водка -3.42967685298\n", + "вешал -3.42967685298\n", + "васс -3.42967685298\n", + "алеж -3.42967685298\n", + "алев -3.42967685298\n", + "анис -3.42967685298\n", + "буча -3.42967685298\n", + "американский -3.39688703016\n", + "бнит -3.39688703016\n", + "text -3.39688703016\n", + "августа -3.39688703016\n", + "арск -3.39688703016\n", + "битву -3.39688703016\n", + "арищ -3.39688703016\n", + "бритв -3.36513833185\n", + "влияние -3.36513833185\n", + "брезгливый -3.36513833185\n", + "астм -3.36513833185\n", + "безумец -3.36513833185\n", + "анастасий -3.36513833185\n", + "амоп -3.36513833185\n", + "азик -3.36513833185\n", + "вкусно -3.33436667318\n", + "алевтина -3.33436667318\n", + "волх -3.33436667318\n", + "вечно -3.33436667318\n", + "азал -3.33436667318\n", + "айдж -3.33436667318\n", + "взой -3.33436667318\n", + "асад -3.30451371003\n", + "весенний -3.30451371003\n", + "батарею -3.30451371003\n", + "беть -3.30451371003\n", + "билось -3.30451371003\n", + "алах -3.30451371003\n", + "68 -3.30451371003\n", + "вкусная -3.30451371003\n", + "витр -3.30451371003\n", + "агонии -3.27552617316\n", + "васт -3.27552617316\n", + "абго -3.27552617316\n", + "видит -3.27552617316\n", + "будут -3.24735529619\n", + "были -3.24735529619\n", + "бестолково -3.24735529619\n", + "алез -3.24735529619\n", + "абстрактный -3.24735529619\n", + "вайн -3.219956322\n", + "бьёт -3.219956322\n", + "визн -3.19328807492\n", + "анфи -3.19328807492\n", + "азие -3.19328807492\n", + "версию -3.16731258852\n", + "бомб -3.16731258852\n", + "бним -3.16731258852\n", + "апуг -3.16731258852\n", + "бронь -3.14199478053\n", + "ачай -3.14199478053\n", + "башмак -3.14199478053\n", + "благородство -3.14199478053\n", + "блег -3.14199478053\n", + "ванны -3.14199478053\n", + "ашмя -3.14199478053\n", + "ахнуть -3.14199478053\n", + "2007 -3.11730216794\n", + "ауди -3.11730216794\n", + "арих -3.11730216794\n", + "90 -3.11730216794\n", + "ванную -3.11730216794\n", + "адне -3.11730216794\n", + "амулет -3.11730216794\n", + "большей -3.09320461636\n", + "бритвенное -3.02421174488\n", + "ачин -3.02421174488\n", + "важная -2.98072663294\n", + "атво -2.98072663294\n", + "бледно -2.98072663294\n", + "арый -2.95967322374\n", + "бросил -2.95967322374\n", + "борька -2.95967322374\n", + "биографии -2.95967322374\n", + "видения -2.89904860192\n", + "верху -2.87963051606\n", + "беспорядок -2.87963051606\n", + "атну -2.87963051606\n", + "анкр -2.86058232109\n", + "анча -2.82354104941\n", + "блестели -2.7704312241\n", + "виска -2.7704312241\n", + "анцу -2.67199115129\n", + "анде -2.64121949262\n", + "веде -2.5681943576\n", + "вдохновение -2.20590142136\n", + "бийц -2.16743514053\n", + "афон -1.94052510642\n", + "адме -1.89640030152\n", + "веще -1.34193650849\n", + "асот -0.776434888377\n" + ] + } + ], + "source": [ + "iverted_vocab = {_id:w for (w,_id) in tfidf_vectorizer.vocabulary_.items()}\n", + "for _id in clf_bnb.coef_.argsort()[0][-150:]:\n", + " print(iverted_vocab[_id], clf_bnb.coef_[0][_id])" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "бедняжка -7.99514040352\n", + "водка -7.99495245644\n", + "абго -7.99402267007\n", + "вовка -7.99109595401\n", + "большой -7.99067482773\n", + "васька -7.9887626459\n", + "виновник -7.9883777705\n", + "ас -7.98752006675\n", + "беременности -7.98752006675\n", + "ател -7.98691061432\n", + "битву -7.98641209628\n", + "вешал -7.98174593528\n", + "великое -7.9798818648\n", + "беременный -7.97453997456\n", + "брот -7.9739442211\n", + "анастасий -7.97312946655\n", + "варька -7.97099704393\n", + "анис -7.9651802522\n", + "актр -7.96515242536\n", + "text -7.95933271608\n", + "волх -7.95539296972\n", + "взой -7.95444032324\n", + "бвен -7.95187632965\n", + "астм -7.95002345557\n", + "вассал -7.94776553345\n", + "васс -7.94518297901\n", + "амоп -7.93597259846\n", + "вишн -7.93474847348\n", + "вкусная -7.93388854655\n", + "бдел -7.92938332542\n", + "анфи -7.92882496973\n", + "алев -7.92731097393\n", + "вздр -7.92603727778\n", + "веселиться -7.92038743085\n", + "атуш -7.91630989866\n", + "агонии -7.91240051128\n", + "булыжник -7.91133289701\n", + "вкусно -7.90588848578\n", + "визн -7.90558765434\n", + "алевтина -7.89884007968\n", + "брезгливый -7.89266117703\n", + "билось -7.88932348437\n", + "вечно -7.88901355514\n", + "алез -7.88682018991\n", + "арич -7.88519141941\n", + "ампула -7.88397355284\n", + "90 -7.88236202844\n", + "батарею -7.88099473669\n", + "азие -7.87939551724\n", + "атчи -7.87779606074\n", + "асад -7.8751875004\n", + "68 -7.86826305203\n", + "алуб -7.86698253959\n", + "арск -7.86688409958\n", + "банда -7.86680055201\n", + "вернулся -7.86155331678\n", + "абсу -7.85685900128\n", + "влиятельный -7.85614590255\n", + "азал -7.8551375543\n", + "браться -7.8487753315\n", + "взмыла -7.84170530792\n", + "беть -7.84116678864\n", + "бизнес -7.83929313098\n", + "балконе -7.8348980129\n", + "алису -7.83433853983\n", + "волне -7.83221226166\n", + "асно -7.83168690786\n", + "бритв -7.82764257657\n", + "ашмя -7.82426601829\n", + "арих -7.82325784387\n", + "бешеной -7.82219548517\n", + "ахнуть -7.81989300079\n", + "безумец -7.8192673018\n", + "африке -7.81720692268\n", + "благородство -7.81667068401\n", + "версию -7.81253810271\n", + "башмак -7.80842568141\n", + "атли -7.80712598324\n", + "влияние -7.80551135895\n", + "адал -7.8040461867\n", + "абстрактный -7.80158406264\n", + "апри -7.79640961952\n", + "виде -7.79360589469\n", + "2007 -7.79259492395\n", + "акту -7.79013335213\n", + "биографии -7.78423947134\n", + "блег -7.78073666286\n", + "апуг -7.78005852692\n", + "айдж -7.77942024446\n", + "будут -7.77885382785\n", + "веревкой -7.77681033311\n", + "алиб -7.76715446459\n", + "ачай -7.76343314217\n", + "ванны -7.76260203984\n", + "бним -7.76006574756\n", + "амулет -7.74446081852\n", + "адне -7.74446081852\n", + "ванную -7.74160576753\n", + "бронь -7.73389718971\n", + "бьёт -7.72013997552\n", + "витр -7.71441976906\n", + "атет -7.70555326833\n", + "ачин -7.70344173359\n", + "алиф -7.69977260777\n", + "важная -7.69621007901\n", + "атем -7.69571467882\n", + "алах -7.68346407125\n", + "бледно -7.6687176387\n", + "быти -7.65758516137\n", + "борька -7.65301092741\n", + "варианта -7.64663831778\n", + "арый -7.64404717304\n", + "большей -7.6322075262\n", + "воздушный -7.62749946518\n", + "беспорядок -7.62716740314\n", + "весенний -7.62352298951\n", + "взрезать -7.61807974546\n", + "видения -7.61728182725\n", + "верху -7.60319110749\n", + "анкр -7.59881554768\n", + "азён -7.58984231606\n", + "анде -7.58887767968\n", + "азик -7.57857416501\n", + "блестели -7.5687864771\n", + "арищ -7.56779518238\n", + "витя -7.55775558988\n", + "ачли -7.54964362504\n", + "бросил -7.54105967348\n", + "бомб -7.52777920551\n", + "больший -7.52144154966\n", + "атво -7.5188307602\n", + "виска -7.51508033129\n", + "бестолково -7.50794211673\n", + "взмыть -7.50645825343\n", + "бритвенное -7.50217475761\n", + "анча -7.4836001499\n", + "адёж -7.46786435925\n", + "были -7.45689735864\n", + "анцу -7.44624471528\n", + "атну -7.43075516979\n", + "веде -7.41848350234\n", + "бнит -7.40752600948\n", + "ауди -7.4009830761\n", + "видит -7.29611083527\n", + "вдохновение -7.29216514524\n", + "бийц -7.2821534349\n", + "афон -7.10159639094\n", + "адме -7.04469375407\n", + "веще -6.41174816125\n", + "асот -6.32784758986\n" + ] + } + ], + "source": [ + "iverted_vocab = {_id:w for (w,_id) in tfidf_vectorizer.vocabulary_.items()}\n", + "for _id in clf_mnb.coef_.argsort()[0][-150:]:\n", + " print(iverted_vocab[_id], clf_mnb.coef_[0][_id])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}