{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Prediction of waiting time using random forest regression" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.model_selection import cross_validate\n", "from sklearn import metrics\n", "from sklearn.metrics import make_scorer\n", "\n", "import xgboost as xgb\n", "\n", "import pandas as pd\n", "\n", "import numpy as np\n", "\n", "np.random.seed(0)\n", "\n", "%matplotlib inline\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0sexboroughyear_entryref_fromref_toareaofcarewtimeentryage
013970225601201216330455178.908333
126125101200833113876669.394447
2256217151012013211115390.963889
373732510120106114911.122222
43294425101201061138283.269447
\n", "
" ], "text/plain": [ " Unnamed: 0 sex borough year_entry ref_from ref_to areaofcare wtime \\\n", "0 139702 2 5601 2012 16 3 304 551 \n", "1 261 2 5101 2008 33 1 138 766 \n", "2 256217 1 5101 2013 21 1 115 39 \n", "3 7373 2 5101 2010 6 1 149 1 \n", "4 32944 2 5101 2010 6 1 138 2 \n", "\n", " entryage \n", "0 78.908333 \n", "1 69.394447 \n", "2 0.963889 \n", "3 1.122222 \n", "4 83.269447 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the waiting-list extract and drop rows with missing values.\n", "# A bare `WL.dropna` only references the method and removes nothing,\n", "# so it is called here and its result is what gets stored in WL.\n", "WL = pd.read_csv('valpos.csv', sep=',').dropna()\n", "WL.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Unnamed: 0', 'sex', 'borough', 'year_entry', 'ref_from', 'ref_to',\n", " 'areaofcare', 'wtime', 'entryage'], dtype=object)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "WL.columns.values" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0sexboroughyear_entryref_fromref_toareaofcarewtimeentryageis_train
013970225601201216330455178.908333True
126125101200833113876669.394447True
2256217151012013211115390.963889True
373732510120106114911.122222True
43294425101201061138283.269447True
\n", "
" ], "text/plain": [ " Unnamed: 0 sex borough year_entry ref_from ref_to areaofcare wtime \\\n", "0 139702 2 5601 2012 16 3 304 551 \n", "1 261 2 5101 2008 33 1 138 766 \n", "2 256217 1 5101 2013 21 1 115 39 \n", "3 7373 2 5101 2010 6 1 149 1 \n", "4 32944 2 5101 2010 6 1 138 2 \n", "\n", " entryage is_train \n", "0 78.908333 True \n", "1 69.394447 True \n", "2 0.963889 True \n", "3 1.122222 True \n", "4 83.269447 True " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "WL['is_train'] = np.random.uniform(0, 1, len(WL)) <= .75\n", "WL.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Split on the boolean flag; .copy() gives train/test their own data so that\n", "# later column assignments do not raise SettingWithCopyWarning.\n", "train, test = WL[WL['is_train']].copy(), WL[~WL['is_train']].copy()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Observations for training: 3757\n", "Observations for testing: 1236\n" ] } ], "source": [ "print('Observations for training:', len(train))\n", "print('Observations for testing:',len(test))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Unnamed: 0',\n", " 'sex',\n", " 'borough',\n", " 'year_entry',\n", " 'ref_from',\n", " 'ref_to',\n", " 'areaofcare',\n", " 'wtime',\n", " 'entryage',\n", " 'is_train']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get a list of the columns\n", "col_list = list(WL)\n", "col_list" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# swap the positions of 'wtime' and 'entryage' in the column list\n", "col_list[7], col_list[8] = col_list[8], col_list[7]\n", "# reorder by selection: assigning the list to WL.columns would only relabel the\n", "# columns, leaving the waiting times under 'entryage' and the ages under 'wtime'\n", "WL = WL[col_list]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0sexboroughyear_entryref_fromref_toareaofcareentryagewtimeis_train
2523357421251012015323524471.088890False
1237347229251012015201802872.669441True
331314356625101201214123197975.219444True
3764295860251012014232446654.150002False
9822224391510120131723310575.266670True
2699458932256012016213801621.605556True
4057227591251012013155193630.130556True
4089799925101201024113812171.505554True
25344650912510120161417842856.025002False
467656371510120116199138345.469444True
\n", "
" ], "text/plain": [ " Unnamed: 0 sex borough year_entry ref_from ref_to areaofcare \\\n", "2523 357421 2 5101 2015 3 2 35 \n", "1237 347229 2 5101 2015 20 1 80 \n", "3313 143566 2 5101 2012 14 1 231 \n", "3764 295860 2 5101 2014 23 2 44 \n", "982 222439 1 5101 2013 17 2 33 \n", "2699 458932 2 5601 2016 21 3 80 \n", "4057 227591 2 5101 2013 15 5 19 \n", "4089 7999 2 5101 2010 24 1 138 \n", "2534 465091 2 5101 2016 14 17 84 \n", "467 65637 1 5101 2011 6 1 99 \n", "\n", " entryage wtime is_train \n", "2523 244 71.088890 False \n", "1237 28 72.669441 True \n", "3313 979 75.219444 True \n", "3764 66 54.150002 False \n", "982 105 75.266670 True \n", "2699 16 21.605556 True \n", "4057 36 30.130556 True \n", "4089 121 71.505554 True \n", "2534 28 56.025002 False \n", "467 1383 45.469444 True " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Inspect ten randomly drawn rows\n", "WL.sample(n=10)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['sex', 'borough', 'year_entry', 'ref_from', 'ref_to', 'areaofcare'], dtype='object')" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Predictors: the six columns that follow the CSV index column\n", "# ('sex' through 'areaofcare'), selected by position\n", "features = WL.columns[1:7]\n", "features" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# Forest of 300 trees; each split considers 2 candidate features\n", "RF = RandomForestRegressor(n_estimators=300, max_features=2)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n", " max_features=2, max_leaf_nodes=None, min_impurity_decrease=0.0,\n", " min_impurity_split=None, min_samples_leaf=1,\n", " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", " n_estimators=300, n_jobs=1, oob_score=False, random_state=None,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit on the training rows only; the target is the waiting time\n", "RF.fit(X=train[features], y=train['wtime'])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sexboroughyear_entryref_fromref_toareaofcare
0256012012163304
1251012008331138
2151012013211115
325101201061149
425101201061138
\n", "
" ], "text/plain": [ " sex borough year_entry ref_from ref_to areaofcare\n", "0 2 5601 2012 16 3 304\n", "1 2 5101 2008 33 1 138\n", "2 1 5101 2013 21 1 115\n", "3 2 5101 2010 6 1 149\n", "4 2 5101 2010 6 1 138" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train[features].head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# assign() returns a new frame, so the cast is not written into a slice of WL\n", "# (the plain column assignment raised SettingWithCopyWarning).\n", "train = train.assign(wtime=train['wtime'].astype('float32'))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 551.0\n", "1 766.0\n", "2 39.0\n", "3 1.0\n", "4 2.0\n", "Name: wtime, dtype: float32" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train['wtime'].head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('sex', 0.042382835374565803),\n", " ('borough', 0.087496141690705811),\n", " ('year_entry', 0.1597124004227839),\n", " ('ref_from', 0.24746011770094545),\n", " ('ref_to', 0.071196461045226353),\n", " ('areaofcare', 0.39175204376577216)]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(zip(train[features], RF.feature_importances_))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Importances come from the fitted forest, which is named RF;\n", "# `clf` was never defined in this notebook and raised NameError.\n", "featuresImportance = pd.DataFrame({'variables': features, 'importance': RF.feature_importances_}).sort_values(by='importance', ascending=False)\n", "featuresImportance" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "plt.barh(featuresImportance.variables,featuresImportance.importance)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }