interactive labeling update

master
Anne Lorenz 2019-02-18 10:02:47 +01:00
parent 146a292914
commit 943c24cef0
14 changed files with 16487 additions and 2981 deletions

File diff suppressed because one or more lines are too long

View File

Can't render this file because it is too large.

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -225,7 +225,7 @@
],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n",
"df = pd.read_csv('../data/interactive_labeling_round_{}_corrected.csv'.format(m),\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
@ -414,11 +414,9 @@
"metadata": {},
"outputs": [],
"source": [
"# THIS CELL IS OPTIONAL\n",
"\n",
"# read current data set from csv\n",
"m = \n",
"df = pd.read_csv('../data/interactive_labeling_round_{}_temp.csv'.format(m),\n",
"m = 9\n",
"df = pd.read_csv('../data/interactive_labeling_round_{}_corrected.csv'.format(m),\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",

View File

@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Jupyter Notebook for Interactive Labeling\n",
"# Jupyter Notebook for Labeling Analysis\n",
"______\n",
"\n",
"This Jupyter Notebook is only for data analysis.\n",
@ -14,7 +14,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 14,
"metadata": {
"scrolled": true
},
@ -25,18 +25,19 @@
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_selection import SelectPercentile\n",
"from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.semi_supervised import label_propagation\n",
"\n",
"from BagOfWords import BagOfWords\n",
"from MNBInteractive import MNBInteractive"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We load the previously created dictionary of all article indices (keys) with a list of mentioned organizations (values).\n",
"In the following, we limit the number of occurrences of a certain company name in all labeled articles to 3 to avoid imbalance."
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -46,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -55,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -70,7 +71,7 @@
],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n",
"df = pd.read_csv('../data/interactive_labeling_round_{}_corrected.csv'.format(m),\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
@ -93,7 +94,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -103,7 +104,18 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# drop the old row labels; reset_index(drop=True) renumbers from 0 without keeping them as a column\n",
"X_train_test = X_train_test.reset_index(drop=True)\n",
"y_train_test = y_train_test.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
@ -129,9 +141,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# BOW: calculating matrix...\n",
"\n",
"# BOW: calculating frequencies...\n",
"\n"
]
}
],
"source": [
"# fit the training data and return the matrix\n",
"training_data = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)\n",
@ -145,7 +168,339 @@
"outputs": [],
"source": [
"# Naive Bayes\n",
"classifier = MultinomialNB(alpha=1.0e-10, fit_prior=False, class_prior=None)"
"#classifier = MultinomialNB(alpha=1.0e-10, fit_prior=False, class_prior=None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Grid search for Label Propagation:"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# Grid search over a SelectPercentile -> LabelSpreading pipeline.\n",
"\n",
"# use stratified k-fold cross-validation as split method\n",
"skf = StratifiedKFold(n_splits=10, shuffle=True)\n",
"\n",
"# use only most important features\n",
"selector = SelectPercentile()\n",
"\n",
"pipeline = Pipeline([('perc', selector), ('LP', label_propagation.LabelSpreading())])\n",
"\n",
"# single-element lists pin a hyperparameter; only the kernel is varied here\n",
"param_grid = {'perc__percentile': [100],\n",
"              'LP__kernel': ['knn', 'rbf'],\n",
"              'LP__gamma': [20],\n",
"              'LP__n_neighbors': [7],\n",
"              'LP__alpha': [0.2],\n",
"              'LP__max_iter': [30],\n",
"              'LP__tol': [0.001],\n",
"              'LP__n_jobs': [None]}\n",
"\n",
"# The target is multiclass, so f1_score needs an explicit averaging mode:\n",
"# its default average='binary' raises\n",
"# \"Target is multiclass but average='binary'\" while scoring each fold.\n",
"# 'macro' weights every class equally; use 'weighted' or 'micro' if preferred.\n",
"grid = GridSearchCV(pipeline, param_grid,\n",
"                    cv=skf,\n",
"                    scoring=make_scorer(f1_score, average='macro'))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Anne\\Anaconda3\\lib\\site-packages\\sklearn\\feature_selection\\univariate_selection.py:114: UserWarning: Features [ 16 38 56 112 152 158 171 198 239 252 266 274\n",
" 305 308 334 345 359 368 377 382 393 402 425 508\n",
" 528 534 550 566 580 607 623 663 687 717 718 720\n",
" 730 731 743 765 780 875 911 936 961 968 969 1036\n",
" 1061 1063 1109 1117 1145 1154 1176 1182 1195 1212 1218 1232\n",
" 1233 1259 1262 1307 1311 1317 1336 1378 1383 1514 1524 1538\n",
" 1564 1611 1636 1642 1648 1650 1666 1691 1695 1725 1731 1745\n",
" 1751 1754 1770 1796 1815 1819 1864 1880 1882 1899 1923 1929\n",
" 1931 1963 1977 1985 1986 2025 2039 2044 2060 2086 2115 2121\n",
" 2124 2162 2168 2170 2174 2194 2203 2238 2248 2282 2288 2310\n",
" 2328 2340 2362 2392 2409 2434 2462 2464 2466 2467 2475 2486\n",
" 2504 2505 2510 2511 2541 2546 2560 2561 2571 2608 2615 2634\n",
" 2649 2754 2784 2791 2819 2839 2873 2893 2923 2935 2937 2959\n",
" 2969 2990 2999 3002 3010 3016 3018 3021 3030 3031 3051 3052\n",
" 3069 3073 3086 3114 3118 3134 3140 3148 3181 3191 3213 3217\n",
" 3220 3227 3232 3257 3312 3347 3517 3531 3551 3639 3692 3711\n",
" 3762 3799 3801 3876 3885 3886 3910 3958 3965 3969 3985 3997\n",
" 4011 4057 4096 4123 4135 4153 4154 4160 4169 4188 4201 4202\n",
" 4204 4205 4215 4220 4232 4245 4258 4259 4264 4270 4283 4289\n",
" 4343 4346 4362 4399 4422 4455 4456 4480 4492 4494 4512 4522\n",
" 4570 4592 4594 4603 4607 4649 4658 4687 4759 4768 4773 4810\n",
" 4815 4836 4837 4853 4878 4896 4942 4947 4962 4988 4997 5000\n",
" 5028 5048 5067 5093 5099 5108 5112 5151 5181 5191 5238 5269\n",
" 5282 5286 5305 5310 5314 5372 5373 5387 5405 5441 5486 5512\n",
" 5516 5542 5548 5554 5562 5575 5581 5681 5690 5704 5713 5726\n",
" 5756 5771 5793 5797 5803 5864 5879 5914 5937 5943 5987 5993\n",
" 6011 6048 6055 6069 6073 6082 6108 6125 6126 6130 6135 6167\n",
" 6237 6351 6369 6386 6414 6421 6426 6427 6469 6483 6494 6514\n",
" 6537 6607 6616 6619 6632 6639 6682 6704 6724 6734 6764 6785\n",
" 6799 6811 6839 6842 6847 6864 6877 6878 6905 6944 6947 6987\n",
" 7008 7059 7061 7067 7069 7076 7083 7106 7138 7161 7187 7189\n",
" 7222 7239 7246 7253 7255 7273 7280 7293 7312 7342 7343 7367\n",
" 7372 7402 7407 7410 7413 7439 7485 7495 7505 7508 7514 7523\n",
" 7526 7544 7559 7569 7598 7603 7612 7624 7673 7677 7708 7720\n",
" 7777 7798 7802 7835 7850 7896 7916 7935 7952 7968 7972 7974\n",
" 8006 8010 8020 8077 8098 8116 8202 8203 8216 8225 8258 8287\n",
" 8302 8346 8360 8372 8388 8432 8434 8441 8478 8496 8502 8525\n",
" 8602 8622 8667 8679 8713 8728 8774 8798 8829 8830 8833 8845\n",
" 8867 8888 8911 8924 8942 8987 9042 9100 9141 9145 9177 9189\n",
" 9220 9223 9227 9231 9238 9279 9292 9298 9355 9371 9375 9399\n",
" 9408 9445 9454 9461 9469 9537 9552 9564 9570 9583 9590 9608\n",
" 9670 9682 9735 9769 9789 9808 9809 9811 9825 9854 9862 9913\n",
" 9942 9950 9973 9978 10005 10015 10025 10034 10040 10045 10078 10089\n",
" 10096 10100 10150 10154 10168 10170 10209 10216 10217 10220 10235 10258\n",
" 10263 10294 10299 10308 10318 10322 10341 10374 10379 10382 10410 10477\n",
" 10488 10501 10508 10521 10549 10584 10648 10669 10673 10684 10707 10724\n",
" 10772 10778 10826 10853 10900 10908 10916 10919 10974 10978 11005 11013\n",
" 11023 11049 11052 11062 11063 11078 11088 11155 11163 11165 11219 11221\n",
" 11240 11284 11291 11292 11295 11300 11328 11406 11427 11433 11446 11456\n",
" 11485 11490 11491 11501 11508 11510 11511 11522 11534 11548 11569 11622\n",
" 11666 11695 11699 11719 11757 11769 11795 11819 11824 11848 11849 11884\n",
" 11919 11930 11933 11936 11952 11966 12012 12063 12105 12118 12122 12144\n",
" 12182 12183 12205 12214 12227 12233 12254 12263 12270 12282 12306 12360\n",
" 12371 12378 12397 12415 12459 12484 12491 12494 12502 12518 12535 12544\n",
" 12562 12645 12676 12688 12722 12738 12742 12752 12766 12769 12771 12801\n",
" 12828 12846 12863 12864 12879 12898 12915 12922 12967 12973 12975 12980\n",
" 13011 13013 13018 13024 13027 13030 13031 13037 13080 13092 13127 13144\n",
" 13214 13220 13233 13268 13295 13319 13377 13381 13386 13395 13461 13464\n",
" 13489 13490 13493 13526 13568 13570 13573 13575 13579 13581 13605 13616\n",
" 13628 13662 13664 13704 13709 13741 13782 13815 13824 13828 13829 13859\n",
" 13876 13883 13888 13929 13945 13953 13975 13976 13979 13980 13981 13987\n",
" 13997 14030 14060 14066 14072 14090 14093 14099 14127 14158 14193 14199\n",
" 14270 14299 14308 14333 14352 14353 14390 14394] are constant.\n",
" UserWarning)\n",
"C:\\Users\\Anne\\Anaconda3\\lib\\site-packages\\sklearn\\feature_selection\\univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide\n",
" f = msb / msw\n"
]
},
{
"ename": "ValueError",
"evalue": "Target is multiclass but average='binary'. Please choose another average setting.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-28-a5c93e384c15>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mgrid\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtraining_data\u001b[0m \u001b[1;33m,\u001b[0m\u001b[0my_train_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[0;32m 720\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresults_container\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 721\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 722\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_run_search\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 723\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 724\u001b[0m \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresults_container\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36m_run_search\u001b[1;34m(self, evaluate_candidates)\u001b[0m\n\u001b[0;32m 1189\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_run_search\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1190\u001b[0m \u001b[1;34m\"\"\"Search all candidates in param_grid\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1191\u001b[1;33m \u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mParameterGrid\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1192\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1193\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36mevaluate_candidates\u001b[1;34m(candidate_params)\u001b[0m\n\u001b[0;32m 709\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mparameters\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 710\u001b[0m in product(candidate_params,\n\u001b[1;32m--> 711\u001b[1;33m cv.split(X, y, groups)))\n\u001b[0m\u001b[0;32m 712\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 713\u001b[0m \u001b[0mall_candidate_params\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcandidate_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 915\u001b[0m \u001b[1;31m# remaining jobs.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 916\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 917\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdispatch_one_batch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 918\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_original_iterator\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 919\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36mdispatch_one_batch\u001b[1;34m(self, iterator)\u001b[0m\n\u001b[0;32m 757\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 758\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 759\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_dispatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 760\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 761\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m_dispatch\u001b[1;34m(self, batch)\u001b[0m\n\u001b[0;32m 714\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 715\u001b[0m \u001b[0mjob_idx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 716\u001b[1;33m \u001b[0mjob\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply_async\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 717\u001b[0m \u001b[1;31m# A job can complete so quickly than its callback is\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 718\u001b[0m \u001b[1;31m# called before we get here, causing self._jobs to\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\_parallel_backends.py\u001b[0m in \u001b[0;36mapply_async\u001b[1;34m(self, func, callback)\u001b[0m\n\u001b[0;32m 180\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mapply_async\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 181\u001b[0m \u001b[1;34m\"\"\"Schedule a func to be run\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 182\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mImmediateResult\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 183\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 184\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\_parallel_backends.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, batch)\u001b[0m\n\u001b[0;32m 547\u001b[0m \u001b[1;31m# Don't delay the application, to avoid keeping the input\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 548\u001b[0m \u001b[1;31m# arguments in memory\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 549\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 550\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 551\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 223\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 224\u001b[0m return [func(*args, **kwargs)\n\u001b[1;32m--> 225\u001b[1;33m for func, args, kwargs in self.items]\n\u001b[0m\u001b[0;32m 226\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 227\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 223\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 224\u001b[0m return [func(*args, **kwargs)\n\u001b[1;32m--> 225\u001b[1;33m for func, args, kwargs in self.items]\n\u001b[0m\u001b[0;32m 226\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 227\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\u001b[0m in \u001b[0;36m_fit_and_score\u001b[1;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)\u001b[0m\n\u001b[0;32m 566\u001b[0m \u001b[0mfit_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 567\u001b[0m \u001b[1;31m# _score will return dict if is_multimetric is True\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 568\u001b[1;33m \u001b[0mtest_scores\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscorer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mis_multimetric\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 569\u001b[0m \u001b[0mscore_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mfit_time\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 570\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mreturn_train_score\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\u001b[0m in \u001b[0;36m_score\u001b[1;34m(estimator, X_test, y_test, scorer, is_multimetric)\u001b[0m\n\u001b[0;32m 603\u001b[0m \"\"\"\n\u001b[0;32m 604\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mis_multimetric\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 605\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_multimetric_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscorer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 606\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 607\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0my_test\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\u001b[0m in \u001b[0;36m_multimetric_score\u001b[1;34m(estimator, X_test, y_test, scorers)\u001b[0m\n\u001b[0;32m 633\u001b[0m \u001b[0mscore\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mscorer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 634\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 635\u001b[1;33m \u001b[0mscore\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mscorer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 636\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 637\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mscore\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'item'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\scorer.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, estimator, X, y_true, sample_weight)\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 97\u001b[0m return self._sign * self._score_func(y_true, y_pred,\n\u001b[1;32m---> 98\u001b[1;33m **self._kwargs)\n\u001b[0m\u001b[0;32m 99\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 100\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py\u001b[0m in \u001b[0;36mf1_score\u001b[1;34m(y_true, y_pred, labels, pos_label, average, sample_weight)\u001b[0m\n\u001b[0;32m 718\u001b[0m return fbeta_score(y_true, y_pred, 1, labels=labels,\n\u001b[0;32m 719\u001b[0m \u001b[0mpos_label\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mpos_label\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maverage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maverage\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 720\u001b[1;33m sample_weight=sample_weight)\n\u001b[0m\u001b[0;32m 721\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 722\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py\u001b[0m in \u001b[0;36mfbeta_score\u001b[1;34m(y_true, y_pred, beta, labels, pos_label, average, sample_weight)\u001b[0m\n\u001b[0;32m 832\u001b[0m \u001b[0maverage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maverage\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 833\u001b[0m \u001b[0mwarn_for\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'f-score'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 834\u001b[1;33m sample_weight=sample_weight)\n\u001b[0m\u001b[0;32m 835\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 836\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py\u001b[0m in \u001b[0;36mprecision_recall_fscore_support\u001b[1;34m(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)\u001b[0m\n\u001b[0;32m 1045\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1046\u001b[0m raise ValueError(\"Target is %s but average='binary'. Please \"\n\u001b[1;32m-> 1047\u001b[1;33m \"choose another average setting.\" % y_type)\n\u001b[0m\u001b[0;32m 1048\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mpos_label\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1049\u001b[0m warnings.warn(\"Note that pos_label (set to %r) is ignored when \"\n",
"\u001b[1;31mValueError\u001b[0m: Target is multiclass but average='binary'. Please choose another average setting."
]
}
],
"source": [
"# run the grid search on the bag-of-words matrix and its labels\n",
"grid.fit(training_data, y_train_test)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "f1_score() missing 2 required positional arguments: 'y_true' and 'y_pred'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-29-be9666c7e418>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m#recall_score(y_true=, y_pred=, average='None')\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;31m#precision_score(average='None')\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mf1_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maverage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'None'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[1;31m#make_scorer(score_func=, average='None')\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mTypeError\u001b[0m: f1_score() missing 2 required positional arguments: 'y_true' and 'y_pred'"
]
}
],
"source": [
"#recall_score(y_true=, y_pred=, average='None')\n",
"#precision_score(average='None')\n",
"#f1_score(y_true=, y_pred=, average='None')\n",
"#make_scorer(score_func=, average='None')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Anne\\Anaconda3\\lib\\site-packages\\sklearn\\feature_selection\\univariate_selection.py:114: UserWarning: Features [ 20 34 41 68 95 96 100 159 178 217 224 296\n",
" 322 343 352 378 406 422 458 461 488 524 526 533\n",
" 548 574 614 623 629 660 662 671 705 713 717 718\n",
" 724 744 798 804 816 875 898 957 960 1006 1008 1035\n",
" 1044 1059 1074 1099 1105 1146 1267 1299 1366 1376 1402 1449\n",
" 1453 1461 1475 1524 1597 1637 1644 1649 1714 1723 1773 1795\n",
" 1801 1859 1873 1878 1909 1933 1974 1978 2009 2015 2026 2052\n",
" 2058 2060 2091 2115 2139 2214 2236 2257 2266 2281 2289 2298\n",
" 2310 2318 2362 2376 2379 2385 2432 2504 2509 2518 2522 2568\n",
" 2570 2582 2600 2619 2673 2709 2712 2718 2739 2751 2770 2776\n",
" 2801 2823 2829 2839 2857 2868 2869 2876 2915 2929 2939 2942\n",
" 2951 2964 2966 3007 3013 3014 3022 3093 3120 3132 3149 3181\n",
" 3182 3184 3219 3242 3255 3274 3316 3317 3346 3357 3372 3406\n",
" 3420 3422 3443 3451 3477 3550 3579 3594 3595 3641 3651 3671\n",
" 3680 3695 3702 3774 3799 3801 3830 3849 3854 3859 3866 3867\n",
" 3873 3940 3943 3947 3954 3969 4045 4060 4063 4090 4092 4099\n",
" 4100 4123 4128 4145 4146 4155 4181 4183 4184 4203 4246 4247\n",
" 4249 4270 4280 4325 4340 4343 4359 4371 4372 4399 4422 4440\n",
" 4451 4458 4459 4460 4493 4500 4511 4532 4551 4602 4606 4610\n",
" 4624 4655 4676 4686 4708 4712 4737 4816 4838 4843 4845 4855\n",
" 4860 4873 4884 4887 4894 4897 4905 4909 4913 4945 4951 4976\n",
" 4985 5010 5031 5035 5090 5112 5122 5145 5172 5173 5178 5193\n",
" 5201 5211 5212 5222 5237 5331 5335 5349 5372 5402 5405 5406\n",
" 5411 5457 5472 5486 5531 5542 5545 5546 5609 5611 5634 5657\n",
" 5669 5691 5754 5757 5777 5798 5803 5831 5838 5846 5853 5915\n",
" 5930 5956 5973 5982 6000 6001 6015 6028 6082 6101 6105 6117\n",
" 6123 6132 6136 6174 6187 6238 6250 6265 6276 6278 6305 6306\n",
" 6320 6339 6378 6390 6396 6413 6426 6442 6468 6484 6535 6546\n",
" 6548 6557 6575 6688 6712 6718 6739 6742 6759 6784 6811 6829\n",
" 6851 6885 6914 6936 6959 6988 6996 6999 7005 7015 7024 7027\n",
" 7035 7106 7146 7189 7260 7293 7311 7312 7326 7344 7371 7398\n",
" 7417 7419 7421 7449 7460 7484 7523 7533 7552 7567 7591 7604\n",
" 7605 7683 7685 7704 7720 7744 7772 7777 7839 7856 7919 7954\n",
" 7967 7974 8002 8018 8033 8053 8107 8132 8146 8153 8168 8170\n",
" 8198 8212 8227 8278 8292 8300 8335 8345 8357 8385 8392 8434\n",
" 8450 8476 8503 8505 8512 8525 8547 8549 8647 8663 8668 8679\n",
" 8717 8736 8796 8803 8837 8838 8929 8979 8987 9021 9083 9105\n",
" 9185 9238 9247 9251 9256 9294 9343 9349 9371 9399 9425 9432\n",
" 9468 9500 9527 9540 9622 9629 9650 9651 9689 9695 9714 9737\n",
" 9805 9846 9854 9858 9868 9887 9890 9893 9914 9934 9940 9952\n",
" 9978 10012 10070 10078 10102 10116 10123 10137 10142 10143 10157 10158\n",
" 10168 10169 10173 10274 10312 10374 10379 10395 10421 10428 10435 10514\n",
" 10556 10557 10562 10606 10616 10638 10644 10693 10728 10730 10772 10782\n",
" 10812 10866 10916 10953 10956 10980 11008 11012 11022 11028 11063 11084\n",
" 11126 11178 11222 11227 11275 11283 11301 11336 11359 11383 11399 11437\n",
" 11452 11528 11549 11557 11558 11585 11611 11623 11665 11672 11704 11706\n",
" 11711 11721 11733 11755 11769 11774 11776 11790 11824 11846 11857 11874\n",
" 11877 11892 11924 11973 12013 12063 12132 12234 12239 12242 12248 12249\n",
" 12252 12313 12408 12424 12433 12461 12472 12498 12507 12537 12648 12654\n",
" 12660 12731 12755 12775 12778 12785 12799 12822 12839 12861 12890 12933\n",
" 12938 12952 12957 12968 12988 13027 13080 13082 13113 13129 13172 13173\n",
" 13175 13222 13236 13255 13275 13283 13296 13343 13346 13451 13462 13469\n",
" 13536 13551 13564 13567 13573 13630 13642 13661 13663 13669 13680 13688\n",
" 13693 13700 13727 13751 13752 13786 13894 13919 13956 13972 14009 14028\n",
" 14048 14090 14112 14132 14199 14200 14218 14225 14275 14298 14299 14310\n",
" 14312 14331 14341 14345 14346 14347 14348 14360 14405] are constant.\n",
" UserWarning)\n",
"C:\\Users\\Anne\\Anaconda3\\lib\\site-packages\\sklearn\\feature_selection\\univariate_selection.py:115: RuntimeWarning: invalid value encountered in true_divide\n",
" f = msb / msw\n"
]
},
{
"ename": "ValueError",
"evalue": "Target is multiclass but average='binary'. Please choose another average setting.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-19-9dbf2b039fee>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mgrid\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtraining_data\u001b[0m \u001b[1;33m,\u001b[0m\u001b[0my_train_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;31m# DataFrame of results\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mdf_results\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgrid\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcv_results_\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[0;32m 720\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresults_container\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 721\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 722\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_run_search\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 723\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 724\u001b[0m \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresults_container\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36m_run_search\u001b[1;34m(self, evaluate_candidates)\u001b[0m\n\u001b[0;32m 1189\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_run_search\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1190\u001b[0m \u001b[1;34m\"\"\"Search all candidates in param_grid\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1191\u001b[1;33m \u001b[0mevaluate_candidates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mParameterGrid\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1192\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1193\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py\u001b[0m in \u001b[0;36mevaluate_candidates\u001b[1;34m(candidate_params)\u001b[0m\n\u001b[0;32m 709\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mparameters\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 710\u001b[0m in product(candidate_params,\n\u001b[1;32m--> 711\u001b[1;33m cv.split(X, y, groups)))\n\u001b[0m\u001b[0;32m 712\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 713\u001b[0m \u001b[0mall_candidate_params\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcandidate_params\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 915\u001b[0m \u001b[1;31m# remaining jobs.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 916\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 917\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdispatch_one_batch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 918\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_original_iterator\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 919\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36mdispatch_one_batch\u001b[1;34m(self, iterator)\u001b[0m\n\u001b[0;32m 757\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 758\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 759\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_dispatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 760\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 761\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m_dispatch\u001b[1;34m(self, batch)\u001b[0m\n\u001b[0;32m 714\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 715\u001b[0m \u001b[0mjob_idx\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 716\u001b[1;33m \u001b[0mjob\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply_async\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 717\u001b[0m \u001b[1;31m# A job can complete so quickly than its callback is\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 718\u001b[0m \u001b[1;31m# called before we get here, causing self._jobs to\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\_parallel_backends.py\u001b[0m in \u001b[0;36mapply_async\u001b[1;34m(self, func, callback)\u001b[0m\n\u001b[0;32m 180\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mapply_async\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 181\u001b[0m \u001b[1;34m\"\"\"Schedule a func to be run\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 182\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mImmediateResult\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 183\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 184\u001b[0m \u001b[0mcallback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\_parallel_backends.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, batch)\u001b[0m\n\u001b[0;32m 547\u001b[0m \u001b[1;31m# Don't delay the application, to avoid keeping the input\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 548\u001b[0m \u001b[1;31m# arguments in memory\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 549\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 550\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 551\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 223\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 224\u001b[0m return [func(*args, **kwargs)\n\u001b[1;32m--> 225\u001b[1;33m for func, args, kwargs in self.items]\n\u001b[0m\u001b[0;32m 226\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 227\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\parallel.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 223\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mparallel_backend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_n_jobs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 224\u001b[0m return [func(*args, **kwargs)\n\u001b[1;32m--> 225\u001b[1;33m for func, args, kwargs in self.items]\n\u001b[0m\u001b[0;32m 226\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 227\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\u001b[0m in \u001b[0;36m_fit_and_score\u001b[1;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)\u001b[0m\n\u001b[0;32m 566\u001b[0m \u001b[0mfit_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 567\u001b[0m \u001b[1;31m# _score will return dict if is_multimetric is True\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 568\u001b[1;33m \u001b[0mtest_scores\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscorer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mis_multimetric\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 569\u001b[0m \u001b[0mscore_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mfit_time\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 570\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mreturn_train_score\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\u001b[0m in \u001b[0;36m_score\u001b[1;34m(estimator, X_test, y_test, scorer, is_multimetric)\u001b[0m\n\u001b[0;32m 603\u001b[0m \"\"\"\n\u001b[0;32m 604\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mis_multimetric\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 605\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_multimetric_score\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscorer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 606\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 607\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0my_test\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\u001b[0m in \u001b[0;36m_multimetric_score\u001b[1;34m(estimator, X_test, y_test, scorers)\u001b[0m\n\u001b[0;32m 633\u001b[0m \u001b[0mscore\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mscorer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 634\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 635\u001b[1;33m \u001b[0mscore\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mscorer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 636\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 637\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mscore\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'item'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\scorer.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, estimator, X, y_true, sample_weight)\u001b[0m\n\u001b[0;32m 96\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 97\u001b[0m return self._sign * self._score_func(y_true, y_pred,\n\u001b[1;32m---> 98\u001b[1;33m **self._kwargs)\n\u001b[0m\u001b[0;32m 99\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 100\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py\u001b[0m in \u001b[0;36mf1_score\u001b[1;34m(y_true, y_pred, labels, pos_label, average, sample_weight)\u001b[0m\n\u001b[0;32m 718\u001b[0m return fbeta_score(y_true, y_pred, 1, labels=labels,\n\u001b[0;32m 719\u001b[0m \u001b[0mpos_label\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mpos_label\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maverage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maverage\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 720\u001b[1;33m sample_weight=sample_weight)\n\u001b[0m\u001b[0;32m 721\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 722\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py\u001b[0m in \u001b[0;36mfbeta_score\u001b[1;34m(y_true, y_pred, beta, labels, pos_label, average, sample_weight)\u001b[0m\n\u001b[0;32m 832\u001b[0m \u001b[0maverage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maverage\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 833\u001b[0m \u001b[0mwarn_for\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'f-score'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 834\u001b[1;33m sample_weight=sample_weight)\n\u001b[0m\u001b[0;32m 835\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 836\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py\u001b[0m in \u001b[0;36mprecision_recall_fscore_support\u001b[1;34m(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)\u001b[0m\n\u001b[0;32m 1045\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1046\u001b[0m raise ValueError(\"Target is %s but average='binary'. Please \"\n\u001b[1;32m-> 1047\u001b[1;33m \"choose another average setting.\" % y_type)\n\u001b[0m\u001b[0;32m 1048\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mpos_label\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1049\u001b[0m warnings.warn(\"Note that pos_label (set to %r) is ignored when \"\n",
"\u001b[1;31mValueError\u001b[0m: Target is multiclass but average='binary'. Please choose another average setting."
]
}
],
"source": [
"# cross-validation results collected by the grid search\n",
"df_results = grid.cv_results_\n",
"mean_scores = df_results['mean_test_score']\n",
"\n",
"# (heading, value) pairs, reported in this order\n",
"report = [\n",
"    ('mean_test_score:', mean_scores),\n",
"    ('mean of means:', sum(mean_scores)/len(mean_scores)),\n",
"    ('best score:', grid.best_score_),\n",
"    ('best parameters set found on development set:', grid.best_params_),\n",
"]\n",
"\n",
"print('RESULTS:')\n",
"print()\n",
"for heading, value in report:\n",
"    print(heading)\n",
"    print(value)\n",
"    print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Apply Label Spreading Algorithm without grid search:"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of wrong estimated articles: 135\n"
]
}
],
"source": [
"classifier = label_propagation.LabelSpreading(kernel='knn', n_neighbors=9)\n",
"\n",
"# fit classifier\n",
"classifier.fit(training_data, y_train_test)\n",
"\n",
"# predict class\n",
"predictions = classifier.predict(testing_data)\n",
"\n",
"# NOTE(review): the labels used for fitting are compared with the predictions\n",
"# for testing_data - this assumes both hold the same articles in the same\n",
"# order (resubstitution); confirm against the cell that builds them\n",
"\n",
"# count mismatches by position: iterating the labels directly avoids\n",
"# label-based lookups such as y_train_test[i], which raise a KeyError when\n",
"# y_train_test is a pandas Series without a default 0..n-1 index\n",
"n = sum(1 for actual, predicted in zip(y_train_test, predictions)\n",
"        if actual != predicted)\n",
"\n",
"if n == 0:\n",
"    print('no resubstitution error :-)')\n",
"else:\n",
"    print('number of wrong estimated articles: {}'.format(n))"
]
},
{
@ -154,70 +509,169 @@
"metadata": {},
"outputs": [],
"source": [
"# fit classifier\n",
"classifier.fit(training_data, y_train_test)"
"# rows that carry a label (every row whose 'Label' is not -1)\n",
"is_labeled = df['Label'] != -1\n",
"\n",
"# X: title and text joined into one string per article, y: its label\n",
"X = df.loc[is_labeled, 'Title'] + '. ' + df.loc[is_labeled, 'Text']\n",
"y = df.loc[is_labeled, 'Label']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# fit classifier\n",
"classifier.fit(training_data, y_train_test)\n",
"\n",
"# label distributions stored on the fitted classifier\n",
"distributions = classifier.label_distributions_\n",
"\n",
"# predicted classes and probability estimates for the test vectors\n",
"predictions = classifier.predict(testing_data)\n",
"class_probs = classifier.predict_proba(testing_data)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# BOW: extracting all words from articles...\n",
"\n",
"# BOW: making vocabulary of data set...\n",
"\n",
"# BOW: vocabulary consists of 14414 features.\n",
"\n",
"# BOW: calculating matrix...\n",
"\n",
"# BOW: calculating frequencies...\n",
"\n",
"Errors at index:\n",
"\n"
]
},
{
"ename": "KeyError",
"evalue": "0",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<timed eval>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n",
"\u001b[1;32m~\\BA\\Python\\src\\MNBInteractive.py\u001b[0m in \u001b[0;36manalyze_errors\u001b[1;34m(dataset, sklearn_cv)\u001b[0m\n\u001b[0;32m 252\u001b[0m \u001b[0mn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 253\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my_train_test\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 254\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0my_train_test\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mpredictions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 255\u001b[0m \u001b[0mn\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 256\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'error no.{}'\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mn\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 765\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_apply_if_callable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 766\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 767\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 768\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 769\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36mget_value\u001b[1;34m(self, series, key)\u001b[0m\n\u001b[0;32m 3116\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3117\u001b[0m return self._engine.get_value(s, k,\n\u001b[1;32m-> 3118\u001b[1;33m tz=getattr(series.dtype, 'tz', None))\n\u001b[0m\u001b[0;32m 3119\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3120\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m0\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minferred_type\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'integer'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'boolean'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_value\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_value\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[1;34m()\u001b[0m\n",
"\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[1;34m()\u001b[0m\n",
"\u001b[1;31mKeyError\u001b[0m: 0"
]
"data": {
"text/plain": [
"array([[8.54899788e-01, 1.40068870e-01, 5.03134154e-03],\n",
" [9.70788345e-01, 1.39376264e-02, 1.52740287e-02],\n",
" [4.11869680e-01, 2.91851521e-01, 2.96278799e-01],\n",
" [9.71611165e-01, 4.28659523e-03, 2.41022402e-02],\n",
" [8.52625238e-01, 1.45191186e-01, 2.18357550e-03],\n",
" [9.91566300e-01, 4.47034922e-04, 7.98666502e-03],\n",
" [8.78047780e-01, 1.22311047e-03, 1.20729110e-01],\n",
" [9.71644767e-01, 2.14997503e-02, 6.85548226e-03],\n",
" [6.46595814e-03, 4.26336042e-01, 5.67197999e-01],\n",
" [8.29852617e-01, 7.24799766e-03, 1.62899386e-01],\n",
" [9.95544213e-01, 1.66494962e-03, 2.79083714e-03],\n",
" [2.80424772e-01, 4.30444437e-01, 2.89130790e-01],\n",
" [9.98002870e-01, 8.90665783e-04, 1.10646389e-03],\n",
" [6.99829073e-01, 1.36489841e-01, 1.63681085e-01],\n",
" [8.48882796e-01, 5.84098963e-03, 1.45276214e-01],\n",
" [4.11292646e-01, 1.50316719e-01, 4.38390635e-01],\n",
" [4.28143896e-01, 1.46699197e-01, 4.25156907e-01],\n",
" [9.87880839e-01, 8.27858840e-03, 3.84057246e-03],\n",
" [8.43079733e-01, 1.17787490e-03, 1.55742392e-01],\n",
" [9.93829449e-01, 1.51747399e-04, 6.01880345e-03],\n",
" [5.73381787e-01, 1.69435088e-03, 4.24923862e-01],\n",
" [8.33533735e-01, 1.49002260e-01, 1.74640056e-02],\n",
" [5.65703640e-01, 1.44979114e-01, 2.89317246e-01],\n",
" [8.42269853e-01, 1.45923802e-01, 1.18063447e-02],\n",
" [5.93840663e-01, 2.88471049e-01, 1.17688289e-01],\n",
" [7.13809112e-01, 1.42513632e-01, 1.43677256e-01],\n",
" [4.19067903e-01, 2.82252991e-01, 2.98679107e-01],\n",
" [8.53309230e-01, 1.45133403e-01, 1.55736739e-03],\n",
" [5.61436668e-01, 1.44922054e-01, 2.93641279e-01],\n",
" [9.97789120e-01, 1.37537683e-03, 8.35503219e-04],\n",
" [7.02629519e-01, 1.42899424e-01, 1.54471056e-01],\n",
" [7.12540834e-01, 2.08167270e-02, 2.66642439e-01],\n",
" [6.99324567e-01, 1.46059237e-01, 1.54616196e-01],\n",
" [9.69351366e-03, 4.24987211e-01, 5.65319275e-01],\n",
" [8.44432151e-01, 5.12661190e-04, 1.55055187e-01],\n",
" [5.48445760e-01, 1.57235221e-01, 2.94319018e-01],\n",
" [8.48674202e-01, 5.96272754e-03, 1.45363070e-01],\n",
" [9.90185775e-01, 5.17028487e-03, 4.64393985e-03],\n",
" [8.44944088e-01, 3.97510322e-03, 1.51080809e-01],\n",
" [9.97787411e-01, 1.31544291e-03, 8.97145939e-04],\n",
" [5.65456296e-01, 2.69399990e-01, 1.65143714e-01],\n",
" [9.84511758e-01, 2.58605471e-03, 1.29021878e-02],\n",
" [7.03085060e-01, 4.58340403e-03, 2.92331536e-01],\n",
" [8.90069495e-01, 8.49338724e-03, 1.01437118e-01],\n",
" [5.67088192e-01, 2.87569948e-01, 1.45341860e-01],\n",
" [1.53181744e-01, 6.92570819e-01, 1.54247437e-01],\n",
" [5.52812978e-01, 1.61409286e-02, 4.31046094e-01],\n",
" [4.48040188e-01, 1.50274670e-01, 4.01685142e-01],\n",
" [5.62875251e-01, 1.46857339e-01, 2.90267410e-01],\n",
" [8.53805515e-01, 1.21296712e-03, 1.44981518e-01],\n",
" [1.53660151e-01, 4.30249473e-01, 4.16090376e-01],\n",
" [6.94354561e-01, 1.45983430e-01, 1.59662009e-01],\n",
" [5.84112538e-01, 1.46038404e-01, 2.69849059e-01],\n",
" [9.83450131e-01, 4.04028759e-03, 1.25095816e-02],\n",
" [8.38687372e-01, 7.18563503e-03, 1.54126993e-01],\n",
" [5.81722514e-01, 4.60947508e-03, 4.13668011e-01],\n",
" [5.57941214e-01, 2.10876534e-03, 4.39950021e-01],\n",
" [9.95224265e-01, 1.98639739e-04, 4.57709476e-03],\n",
" [9.79413085e-01, 4.24329744e-04, 2.01625855e-02],\n",
" [5.79619256e-01, 5.17521337e-03, 4.15205530e-01],\n",
" [9.92882118e-01, 2.35891031e-03, 4.75897199e-03],\n",
" [8.51341515e-01, 3.64750635e-03, 1.45010979e-01],\n",
" [8.45339733e-01, 3.63816454e-03, 1.51022102e-01],\n",
" [9.98872515e-01, 1.78547939e-05, 1.10962986e-03],\n",
" [8.14074062e-01, 9.54940956e-04, 1.84970997e-01],\n",
" [8.78046593e-01, 1.21129948e-03, 1.20742107e-01],\n",
" [6.95031948e-01, 8.34917684e-03, 2.96618875e-01],\n",
" [3.31850699e-01, 2.37973260e-01, 4.30176040e-01],\n",
" [9.89237516e-01, 7.49143364e-03, 3.27105062e-03],\n",
" [9.97714937e-01, 1.68473457e-03, 6.00328637e-04],\n",
" [4.19209822e-01, 1.44072823e-01, 4.36717355e-01],\n",
" [2.94591725e-01, 2.89125831e-01, 4.16282443e-01],\n",
" [9.75982721e-01, 2.75243090e-04, 2.37420360e-02],\n",
" [9.97107481e-01, 6.97604109e-05, 2.82275900e-03],\n",
" [5.51016156e-01, 1.00351119e-02, 4.38948732e-01],\n",
" [9.96805873e-01, 3.19999504e-04, 2.87412742e-03],\n",
" [4.42435017e-01, 2.52119511e-03, 5.55043787e-01],\n",
" [5.73698418e-01, 1.41414109e-01, 2.84887473e-01],\n",
" [6.95375543e-01, 1.49726181e-01, 1.54898276e-01],\n",
" [2.81028664e-01, 2.86590544e-01, 4.32380792e-01],\n",
" [7.08521191e-01, 9.19214465e-03, 2.82286664e-01],\n",
" [7.03662405e-01, 7.10271014e-04, 2.95627324e-01],\n",
" [1.53347487e-01, 1.35979720e-01, 7.10672793e-01],\n",
" [7.12109740e-01, 1.44001317e-01, 1.43888943e-01],\n",
" [9.96996349e-01, 9.61612147e-04, 2.04203904e-03],\n",
" [5.62854012e-01, 1.47352408e-01, 2.89793580e-01],\n",
" [8.53494351e-01, 5.18793471e-04, 1.45986856e-01],\n",
" [8.78047780e-01, 1.22311047e-03, 1.20729110e-01],\n",
" [9.92175376e-01, 2.17497733e-03, 5.64964635e-03],\n",
" [8.26498534e-01, 7.08476057e-03, 1.66416705e-01],\n",
" [6.79028398e-01, 1.44096986e-01, 1.76874616e-01],\n",
" [7.12800472e-01, 1.42820243e-01, 1.44379284e-01],\n",
" [9.96162143e-01, 2.01479906e-03, 1.82305796e-03],\n",
" [7.11784885e-01, 1.43809245e-01, 1.44405870e-01],\n",
" [8.33037118e-01, 7.70962537e-03, 1.59253256e-01],\n",
" [8.53978607e-01, 1.43243201e-01, 2.77819155e-03],\n",
" [9.93520576e-01, 1.88199468e-03, 4.59742883e-03],\n",
" [5.61084959e-01, 4.25588605e-03, 4.34659155e-01],\n",
" [7.14798944e-01, 1.08574273e-03, 2.84115314e-01],\n",
" [7.26043968e-01, 1.44468580e-01, 1.29487453e-01]])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Predict class\n",
"predictions = classifier.predict(testing_data)\n",
"print('Errors at index:')\n",
"print()\n",
"n = 0\n",
"for i in range(len(y_train_test)):\n",
" if y_train_test[i] != predictions[i]:\n",
" n += 1\n",
" print('error no.{}'.format(n))\n",
" print('prediction at index {} is: {}, but actual is: {}'\n",
" .format(i, predictions[i], y_train_test[i]))\n",
" print(X_train_test[i])\n",
" print(y_train_test[i])\n",
" print()\n",
"#print metrics\n",
"print('F1 score: ', format(f1_score(y_train_test, predictions)))"
"class_probs[:100]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"predictions[:100]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"distributions[:100]"
]
},
{
@ -241,6 +695,21 @@
"# call script with manually labeled and manually unlabeled samples\n",
"%time MNBInteractive.measure_mnb(X, y, cv)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"# save corrected round to csv\n",
"df.to_csv('../data/interactive_labeling_round_{}_corrected.csv'.format(m),\n",
" sep='|',\n",
" mode='w',\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')"
]
}
],
"metadata": {

File diff suppressed because it is too large Load Diff

View File

@ -23,267 +23,285 @@ from nltk.stem.porter import PorterStemmer
class BagOfWords:
def fit_transform(corpus, rel_freq=True, stemming=True):
    '''Build the document term matrix of a corpus in a single call.

    Similar to CountVectorizer's fit_transform method: the words of every
    article are extracted, the vocabulary is derived from them and the
    matrix is computed over that vocabulary.

    corpus:   iterable of article texts
    rel_freq: passed on to make_matrix (relative instead of absolute
              frequencies)
    stemming: passed on to extract_all_words, make_vocab and make_matrix
    returns:  the matrix produced by BagOfWords.make_matrix
    '''
    extracted_words = BagOfWords.extract_all_words(corpus, stemming)
    vocab = BagOfWords.make_vocab(extracted_words, stemming)
    return BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
                                  stemming)
def extract_words(text, stemming=True):
    '''Split one article into a list of cleaned, lower-cased words.

    Company names (see load_company_names) are cut out of the text first.
    Every non-word character is then treated as a separator and only
    purely alphabetic tokens are kept, so numbers are dropped.
    Recurrences are included in the result.

    text:     the article as a single string
    stemming: reduce every word to its Porter stem if True
    returns:  list of words in order of appearance
    '''
    # the stop word filter is switched off in this version, so the stop
    # word set is deliberately not built here any more
    stemmer = PorterStemmer() if stemming else None

    # ignore company names
    for company in BagOfWords.load_company_names():
        text = text.replace(company, '')

    # replace punctuation marks with spaces and
    # split str into list of single words
    tokens = re.sub(r'\W', ' ', text).split()

    # list of all words to return
    words_cleaned = []
    for word in tokens:
        word = word.lower()
        # keep alphabetic tokens only
        if not word.isalpha():
            continue
        if stemming:
            # reduce word to its stem
            word = stemmer.stem(word)
        # filter out spam chars
        word = word.replace('â', '').replace('œ', '')\
                   .replace('ã', '')
        # a token that consisted of spam chars only is empty now and
        # must not end up as a feature
        if word:
            words_cleaned.append(word)
    return words_cleaned
def extract_all_words(corpus, stemming=True):
    '''Run extract_words on every article of the corpus.

    corpus:   iterable of article texts
    stemming: passed on to extract_words
    returns:  list of lists of extracted words, one inner list per article
              (same order as the corpus)
    '''
    print('# BOW: extracting all words from articles...')
    print()
    return [BagOfWords.extract_words(text, stemming) for text in corpus]
def make_matrix(extracted_words, vocab, rel_freq=True, stemming=True):
    '''Calculate the word (stem) frequencies of the input articles.

    extracted_words: list of word lists, one per article
    vocab:           list of words; one matrix column per entry
    rel_freq:        if True every occurrence counts 1 / (total number of
                     words in all articles), so values are 0 <= value <= 1;
                     if False plain occurrence counts are stored
    stemming:        not used here, kept for interface compatibility
    returns:         document term matrix as DataFrame of floats
                     (rows: articles, columns: words in vocab)

    Words that are not part of vocab are ignored.
    '''
    print('# BOW: calculating matrix...')
    print()

    # total number of words in bag of words
    word_count = sum(len(words) for words in extracted_words)
    # number of articles
    n_articles = len(extracted_words)
    # number of words in vocab
    l_vocab = len(vocab)

    # zero-filled array; counting is done on the array because chained
    # assignment on a DataFrame (df.loc[i][v] += ...) may write to a
    # temporary copy and is far slower
    array = np.zeros(shape=(n_articles, l_vocab))

    print('# BOW: calculating frequencies...')
    print()

    # word => column position, replaces the scan over the whole vocabulary
    column_of = {word: j for j, word in enumerate(vocab)}

    for i, words in enumerate(extracted_words):
        for word in words:
            j = column_of.get(word)
            if j is not None:
                array[i, j] += 1

    if rel_freq and word_count:
        # relative word frequency
        array /= word_count

    return pd.DataFrame(array, columns=vocab)
def make_vocab(extracted_words, stemming=True):
    '''Collect all words of all articles in one global vocabulary.

    extracted_words: list of lists of all extracted words
    stemming:        not used here, kept for interface compatibility
    returns:         list of the distinct words, sorted alphabetically so
                     that the feature order is the same in every run
    '''
    print('# BOW: making vocabulary of data set...')
    print()
    vocab = set()
    # add every single word of every article to the vocabulary
    for e_list in extracted_words:
        vocab.update(e_list)
    print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
    print()
    # transform set to (sorted) list
    return sorted(vocab)
def load_company_names():
    '''Return the keys of the pickled organization dictionary as a list.

    The file '../obj/dict_organizations.pkl' is read relative to the
    current working directory.
    '''
    # load pickle object of organizations
    # NOTE: pickle.load executes code embedded in the file, only use it on
    # files created by this project
    with open('../obj/dict_organizations.pkl', 'rb') as pkl_file:
        organizations = pickle.load(pkl_file)
    return list(organizations.keys())
def set_stop_words(stemming=True):
    '''Create the set of all words that are meant to be ignored:
    English stop words, names of news sources and other boilerplate
    terms, weekdays and months.

    stemming: reduce every entry to its Porter stem if True, so that the
              set can be compared with stemmed words
    returns:  set of words (duplicates are eliminated)
    '''
    # stopwords
    stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                  'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                  'aren\'t', 'as', 'at', 'be', 'because', 'been',
                  'before', 'being', 'below', 'between', 'both', 'but',
                  'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
                  'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
                  'don', 'don\'t', 'down', 'during', 'each', 'few',
                  'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
                  'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
                  'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                  'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                  'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
                  'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
                  'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
                  'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
                  'on', 'once', 'only', 'or', 'other', 'our', 'ours',
                  'ourselves', 'out', 'over', 'own', 're', 's', 'same',
                  'shan', 'shan\'t', 'she', 'she\'s', 'should',
                  'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
                  'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
                  'theirs', 'them', 'themselves', 'then', 'there',
                  'these', 'they', 'this', 'those', 'through', 'to',
                  'too', 'under', 'until', 'up', 've', 'very', 'was',
                  'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
                  'what', 'when', 'where', 'which', 'while', 'who',
                  'whom', 'why', 'will', 'with', 'won', 'won\'t',
                  'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                  'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                  'yourselves']

    #add unwanted terms
    stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
                       'file', 'photo', 'min', 'read', 'staff', 'left', 'â',
                       'right', 'updated', 'minutes', 'brief', 'editing',
                       'reporting', 'ago', 'also', 'would', 'could',
                       'bit', 'ly', 'fy', 'economist', 'u', 'guardian'])

    # weekdays
    stop_words.extend(['monday', 'tuesday', 'wednesday', 'thursday', 'friday',
                       'saturday', 'sunday'])

    # months and their abbreviations
    stop_words.extend(['january', 'february', 'march', 'april', 'may',
                       'june', 'july', 'august', 'september', 'october',
                       'november', 'december', 'jan', 'feb', 'mar', 'apr',
                       'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov',
                       'dec'])

    if stemming:
        stemmer = PorterStemmer()
        # reduce stop words to stem
        stop_words = [stemmer.stem(word) for word in stop_words]
    # transform list to set to eliminate duplicates
    return set(stop_words)
def make_dict_common_words(df_matrix, n=200, rel_freq=True, stemming=True):
    '''Determine the n most common words of a document term matrix.

    df_matrix: DataFrame document term matrix of the complete data set
    n:         number of most common words to return; if fewer words
               pass the limit, all of them are returned
    rel_freq:  True if df_matrix holds relative frequencies; decides
               which lower limit is applied to the column sums
    stemming:  not used here, kept for interface compatibility
    returns:   dict (word => summed count), highest count first

    The result is also pickled to
    '../obj/dict_200_most_common_words.pkl' (the file name does not
    depend on n).
    '''
    print('# making dictionary of most common words...')
    print()

    # words whose summed frequency is not above this limit are left out
    limit = 0.0001
    if not rel_freq:
        limit = len(df_matrix) * 0.0001

    # word => count
    totals = {}
    # iterate over words
    for column in df_matrix:
        # count word mentions in total
        total = df_matrix[column].sum()
        if total > limit:
            totals[column] = total

    # sort dict by value
    o_dict = OrderedDict(sorted(totals.items(), key=lambda t: t[1],
                                reverse=True))
    print(o_dict)

    # the n highest values as dict (word => count); slicing instead of
    # popping n times, which failed when fewer than n words were left
    n_dict = dict(list(o_dict.items())[:max(n, 0)])

    # save n_dict object
    with open('../obj/dict_200_most_common_words.pkl', 'wb') as f:
        pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)

    return n_dict
def count_features(texts, stemming=True):
    '''Count the total number of features (distinct words) in a corpus.

    texts:    handed to make_vocab unchanged, so it has to be a list of
              word lists (e.g. the result of extract_all_words)
    stemming: passed on to make_vocab
    returns:  size of the vocabulary
    '''
    print('# BOW: counting all features in corpus...')
    print()
    return len(BagOfWords.make_vocab(texts, stemming))
def count_all_words(texts):
    '''Count the whitespace separated words of all texts.

    texts:   iterable of strings
    returns: total number of words, recurrences included
    '''
    print('# counting all words in corpus...')
    print()
    return sum(len(text.split()) for text in texts)
def test():
    '''Manual smoke test: read the cleaned data set, build the vocabulary
    of all articles (with stemming) and print its size.
    '''
    # forward slashes work on every platform, like the '../obj/' paths
    # used elsewhere in this class
    file = '../data/cleaned_data_set_without_header.csv'
    df_dataset = pd.read_csv(file,
                             delimiter='|',
                             header=None,
                             index_col=None,
                             engine='python',
                             usecols=[1,2],
                             #nrows=100,
                             quoting=csv.QUOTE_NONNUMERIC,
                             quotechar='\'')

    # the two columns read above are joined to one text per article
    corpus = df_dataset[1] + '. ' + df_dataset[2]
    stemming = True
    #print(BagOfWords.count_features(corpus))
    extracted_words = BagOfWords.extract_all_words(corpus, stemming)
    vocab = BagOfWords.make_vocab(extracted_words, stemming)
    print(len(vocab))
# Script entry point.
# NOTE(review): indentation is not visible in this view, so it cannot be
# confirmed here which of the statements below are guarded by the `if`.
if __name__ == '__main__':
BagOfWords.test()
# Ad-hoc demo: runs the tokenizing, lower-casing, stemming and spam char
# steps on one hard-coded sentence and prints the resulting word list.
# No company names are removed and no stop words are filtered here.
stemmer = PorterStemmer()
text = 'German Economy Minister Peter Altmaier said on Tuesday that he did not favor getting ministerial approval for deals such as the proposal to merge Siemens and Alstoms rail businesses to better compete in Europe and abroad.'
# replace punctuation marks with spaces
words = re.sub(r'\W', ' ', text)
# split str into list of single words
words = words.split()
# list of all words to return
words_cleaned = []
for word in words:
word = word.lower()
# keep alphabetic tokens only (the stop word check is commented out)
if (word.isalpha()):# and word not in stop_words):
# reduce word to its stem
word = stemmer.stem(word)
# filter out spam chars
word = word.replace('â', '').replace('œ', '')\
.replace('ã', '')
words_cleaned.append(word)
print(words_cleaned)

View File

@ -1,42 +1,92 @@
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pickle
class LabelingPlotter():
    '''Plots that summarise the interactive labeling rounds.'''

    @staticmethod
    def plot_labeling_rounds():
        '''Plot error rate, fraction of manual labels and fraction of
        estimated labels per labeling round (three stacked axes), save the
        figure as PNG and show it. All numbers are hard-coded below.
        '''
        # round numbers
        rounds = [0,1,2,3,4,5,6,7,8,9]

        # fraction of wrong estimated labels per round
        wrong = [0/100, 19/100, 17/100, 16/100, 20/100, 12/100, 10/100, 20/100, 14/100, 12/100]

        # fraction of manual classified articles per class and round
        man_0 = [84/100, 165/200, 247/300, 329/400, 410/500, 498/600, 586/700, 662/800, 741/900, 821/1000]
        man_1 = [3/100, 7/200, 12/300, 16/400, 20/500, 22/600, 23/700, 29/800, 37/900, 39/1000]
        man_2 = [13/100, 28/200, 41/300, 55/400, 70/500, 80/600, 91/700, 109/800, 122/900, 140/1000]

        # fraction of estimated labels per class and round
        # NOTE(review): 9735/9500 is larger than 1 - looks like a typo in
        # the round 4 count, confirm against the labeling log
        est_0 = [9873/9900, 9757/9800, 9603/9700, 9470/9600, 9735/9500, 9238/9400, 9107/9300, 8007/9200, 8064/9100, 7641/9000]
        est_1 = [14/9900, 15/9800, 11/9700, 11/9600, 16/9500, 17/9400, 18/9300, 19/9200, 18/9100, 20/9000]
        est_2 = [12/9900, 26/9800, 77/9700, 94/9600, 380/9500, 123/9400, 147/9300, 676/9200, 595/9100, 837/9000]

        fig, ax = plt.subplots(3, 1)

        ax[0].plot(rounds, wrong)
        ax[2].set_xlabel('Iteration number')
        ax[0].set_ylabel('Error rate')

        ax[1].plot(rounds, man_0, rounds, man_1, rounds, man_2)
        ax[1].set_ylabel('Fraction of manual labels')

        ax[2].plot(rounds, est_0, rounds, est_1, rounds, est_2)
        ax[2].set_ylabel('Fraction of estimated labels')

        # limit x axis
        for axis in ax:
            axis.set_xbound(lower=1, upper=9)

        ax[0].set_ybound(lower=0)
        ax[1].set_ybound(lower=0)

        # insert legend
        ax[1].legend(('class 0', 'class 1', 'class 2'))
        ax[2].legend(('class 0', 'class 1', 'class 2'))

        fig.tight_layout()

        # forward slashes work on every platform
        plt.savefig('../visualization/Labeling_Grafik_070219.png')
        plt.show()

    @staticmethod
    def plot_cumulative():
        '''Show a cumulative step histogram of the pickled class
        probabilities ('../obj/array_class_probs.pkl').
        '''
        # load pickle object
        # NOTE: pickle.load executes code embedded in the file, only use
        # it on files created by this project
        with open('../obj/array_class_probs.pkl', 'rb') as pkl_file:
            loaded = pickle.load(pkl_file)

        # descending order; np.sort accepts lists as well as arrays
        probas = np.sort(np.asarray(loaded))[::-1]

        n_bins = 50

        fig, ax = plt.subplots(figsize=(8, 4))

        # plot the cumulative histogram
        # ('density' replaces the removed 'normed' argument)
        ax.hist(probas, n_bins, density=True, histtype='step',
                cumulative=True, facecolor='darkred')

        # NOTE(review): the labels are derived from the ticks that exist
        # at this point and scaled by 1/200; the axis limits are changed
        # afterwards - confirm that labels and limits are still intended
        vals = ax.get_yticks()
        ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])

        ax.grid(True)
        # no legend: the histogram carries no label that could be shown
        #ax.set_title('Cumulative distribution of highest estimated probability')
        ax.set_xlabel('Highest estimated probability')
        ax.set_ylabel('Percentage of articles with this highest estimated probability')
        plt.axis([0.5, 0.99, 0, 0.006])
        ax.set_xbound(lower=0.5, upper=0.99)

        plt.show()


if __name__ == '__main__':
    LabelingPlotter.plot_labeling_rounds()

View File

@ -208,56 +208,4 @@ class MNBInteractive:
# print('number of samples of each class:')
# print()
# #print(class_counts)
# print()
######## only needed for the resubstitution error ########
def analyze_errors(dataset, sklearn_cv):
    '''Calculate the resubstitution error and print every misclassified
    article.

    A Multinomial Naive Bayes classifier is fitted on the complete data
    set and then predicts that very same data (no train test split).

    dataset:    data with the columns 'Title', 'Text' and 'Label'
    sklearn_cv: True to vectorize with sklearn's CountVectorizer,
                False to use the BagOfWords implementation
    '''
    X_train_test = dataset['Title'] + ' ' + dataset['Text']
    y_train_test = dataset['Label']

    if sklearn_cv:
        # use sklearn CountVectorizer
        cv = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = cv.fit_transform(X_train_test, y_train_test).toarray()
        # transform testing data and return the matrix
        testing_data = cv.transform(X_train_test).toarray()
    else:
        # use my own BagOfWords python implementation
        stemming = True
        rel_freq = True
        extracted_words = BagOfWords.extract_all_words(X_train_test)
        vocab = BagOfWords.make_vocab(extracted_words)
        # fit the training data and return the matrix
        training_data = BagOfWords.make_matrix(extracted_words,
                                               vocab, rel_freq, stemming)
        testing_data = training_data

    # Naive Bayes
    classifier = MultinomialNB(alpha=1.0e-10,
                               fit_prior=False,
                               class_prior=None)
    # fit classifier
    classifier.fit(training_data, y_train_test)

    # Predict class
    predictions = classifier.predict(testing_data)

    print('Errors at index:')
    print()
    n = 0
    for i in range(len(y_train_test)):
        # positional access: the predictions are positional, and the
        # index of the data set does not have to be 0..n-1
        actual = y_train_test.iloc[i]
        if actual != predictions[i]:
            n += 1
            print('error no.{}'.format(n))
            print('prediction at index {} is: {}, but actual is: {}'
                  .format(i, predictions[i], actual))
            print(X_train_test.iloc[i])
            print(actual)
            print()
    #print metrics
    # NOTE(review): f1_score is called without 'average'; presumably the
    # labels are binary here - confirm, other targets need an explicit
    # averaging mode
    print('F1 score: ', format(f1_score(y_train_test, predictions)))
# print()

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB