{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prediction of waiting time using R"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.model_selection import cross_validate\n",
"from sklearn import metrics\n",
"from sklearn.metrics import make_scorer\n",
"\n",
"import xgboost as xgb\n",
"\n",
"import pandas as pd\n",
"\n",
"import numpy as np\n",
"\n",
"np.random.seed(0)\n",
"\n",
"%matplotlib inline\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" sex | \n",
" borough | \n",
" year_entry | \n",
" ref_from | \n",
" ref_to | \n",
" areaofcare | \n",
" wtime | \n",
" entryage | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 139702 | \n",
" 2 | \n",
" 5601 | \n",
" 2012 | \n",
" 16 | \n",
" 3 | \n",
" 304 | \n",
" 551 | \n",
" 78.908333 | \n",
"
\n",
" \n",
" 1 | \n",
" 261 | \n",
" 2 | \n",
" 5101 | \n",
" 2008 | \n",
" 33 | \n",
" 1 | \n",
" 138 | \n",
" 766 | \n",
" 69.394447 | \n",
"
\n",
" \n",
" 2 | \n",
" 256217 | \n",
" 1 | \n",
" 5101 | \n",
" 2013 | \n",
" 21 | \n",
" 1 | \n",
" 115 | \n",
" 39 | \n",
" 0.963889 | \n",
"
\n",
" \n",
" 3 | \n",
" 7373 | \n",
" 2 | \n",
" 5101 | \n",
" 2010 | \n",
" 6 | \n",
" 1 | \n",
" 149 | \n",
" 1 | \n",
" 1.122222 | \n",
"
\n",
" \n",
" 4 | \n",
" 32944 | \n",
" 2 | \n",
" 5101 | \n",
" 2010 | \n",
" 6 | \n",
" 1 | \n",
" 138 | \n",
" 2 | \n",
" 83.269447 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 sex borough year_entry ref_from ref_to areaofcare wtime \\\n",
"0 139702 2 5601 2012 16 3 304 551 \n",
"1 261 2 5101 2008 33 1 138 766 \n",
"2 256217 1 5101 2013 21 1 115 39 \n",
"3 7373 2 5101 2010 6 1 149 1 \n",
"4 32944 2 5101 2010 6 1 138 2 \n",
"\n",
" entryage \n",
"0 78.908333 \n",
"1 69.394447 \n",
"2 0.963889 \n",
"3 1.122222 \n",
"4 83.269447 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"WL=pd.read_csv('valpos.csv', sep=',')\n",
"WL.dropna\n",
"WL.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Unnamed: 0', 'sex', 'borough', 'year_entry', 'ref_from', 'ref_to',\n",
" 'areaofcare', 'wtime', 'entryage'], dtype=object)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"WL.columns.values"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" sex | \n",
" borough | \n",
" year_entry | \n",
" ref_from | \n",
" ref_to | \n",
" areaofcare | \n",
" wtime | \n",
" entryage | \n",
" is_train | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 139702 | \n",
" 2 | \n",
" 5601 | \n",
" 2012 | \n",
" 16 | \n",
" 3 | \n",
" 304 | \n",
" 551 | \n",
" 78.908333 | \n",
" True | \n",
"
\n",
" \n",
" 1 | \n",
" 261 | \n",
" 2 | \n",
" 5101 | \n",
" 2008 | \n",
" 33 | \n",
" 1 | \n",
" 138 | \n",
" 766 | \n",
" 69.394447 | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" 256217 | \n",
" 1 | \n",
" 5101 | \n",
" 2013 | \n",
" 21 | \n",
" 1 | \n",
" 115 | \n",
" 39 | \n",
" 0.963889 | \n",
" True | \n",
"
\n",
" \n",
" 3 | \n",
" 7373 | \n",
" 2 | \n",
" 5101 | \n",
" 2010 | \n",
" 6 | \n",
" 1 | \n",
" 149 | \n",
" 1 | \n",
" 1.122222 | \n",
" True | \n",
"
\n",
" \n",
" 4 | \n",
" 32944 | \n",
" 2 | \n",
" 5101 | \n",
" 2010 | \n",
" 6 | \n",
" 1 | \n",
" 138 | \n",
" 2 | \n",
" 83.269447 | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 sex borough year_entry ref_from ref_to areaofcare wtime \\\n",
"0 139702 2 5601 2012 16 3 304 551 \n",
"1 261 2 5101 2008 33 1 138 766 \n",
"2 256217 1 5101 2013 21 1 115 39 \n",
"3 7373 2 5101 2010 6 1 149 1 \n",
"4 32944 2 5101 2010 6 1 138 2 \n",
"\n",
" entryage is_train \n",
"0 78.908333 True \n",
"1 69.394447 True \n",
"2 0.963889 True \n",
"3 1.122222 True \n",
"4 83.269447 True "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"WL['is_train'] = np.random.uniform(0, 1, len(WL)) <= .75\n",
"WL.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train, test = WL[WL['is_train']==True], WL[WL['is_train']==False]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Observations for training: 3757\n",
"Observations for testing: 1236\n"
]
}
],
"source": [
"print('Observations for training:', len(train))\n",
"print('Observations for testing:',len(test))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Unnamed: 0',\n",
" 'sex',\n",
" 'borough',\n",
" 'year_entry',\n",
" 'ref_from',\n",
" 'ref_to',\n",
" 'areaofcare',\n",
" 'wtime',\n",
" 'entryage',\n",
" 'is_train']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get a list of the columns\n",
"col_list = list(WL)\n",
"col_list"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# use this handy way to swap the elements\n",
"col_list[7], col_list[8] = col_list[8], col_list[7]\n",
"# assign back, the order will now be swapped\n",
"WL.columns = col_list"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" sex | \n",
" borough | \n",
" year_entry | \n",
" ref_from | \n",
" ref_to | \n",
" areaofcare | \n",
" entryage | \n",
" wtime | \n",
" is_train | \n",
"
\n",
" \n",
" \n",
" \n",
" 2523 | \n",
" 357421 | \n",
" 2 | \n",
" 5101 | \n",
" 2015 | \n",
" 3 | \n",
" 2 | \n",
" 35 | \n",
" 244 | \n",
" 71.088890 | \n",
" False | \n",
"
\n",
" \n",
" 1237 | \n",
" 347229 | \n",
" 2 | \n",
" 5101 | \n",
" 2015 | \n",
" 20 | \n",
" 1 | \n",
" 80 | \n",
" 28 | \n",
" 72.669441 | \n",
" True | \n",
"
\n",
" \n",
" 3313 | \n",
" 143566 | \n",
" 2 | \n",
" 5101 | \n",
" 2012 | \n",
" 14 | \n",
" 1 | \n",
" 231 | \n",
" 979 | \n",
" 75.219444 | \n",
" True | \n",
"
\n",
" \n",
" 3764 | \n",
" 295860 | \n",
" 2 | \n",
" 5101 | \n",
" 2014 | \n",
" 23 | \n",
" 2 | \n",
" 44 | \n",
" 66 | \n",
" 54.150002 | \n",
" False | \n",
"
\n",
" \n",
" 982 | \n",
" 222439 | \n",
" 1 | \n",
" 5101 | \n",
" 2013 | \n",
" 17 | \n",
" 2 | \n",
" 33 | \n",
" 105 | \n",
" 75.266670 | \n",
" True | \n",
"
\n",
" \n",
" 2699 | \n",
" 458932 | \n",
" 2 | \n",
" 5601 | \n",
" 2016 | \n",
" 21 | \n",
" 3 | \n",
" 80 | \n",
" 16 | \n",
" 21.605556 | \n",
" True | \n",
"
\n",
" \n",
" 4057 | \n",
" 227591 | \n",
" 2 | \n",
" 5101 | \n",
" 2013 | \n",
" 15 | \n",
" 5 | \n",
" 19 | \n",
" 36 | \n",
" 30.130556 | \n",
" True | \n",
"
\n",
" \n",
" 4089 | \n",
" 7999 | \n",
" 2 | \n",
" 5101 | \n",
" 2010 | \n",
" 24 | \n",
" 1 | \n",
" 138 | \n",
" 121 | \n",
" 71.505554 | \n",
" True | \n",
"
\n",
" \n",
" 2534 | \n",
" 465091 | \n",
" 2 | \n",
" 5101 | \n",
" 2016 | \n",
" 14 | \n",
" 17 | \n",
" 84 | \n",
" 28 | \n",
" 56.025002 | \n",
" False | \n",
"
\n",
" \n",
" 467 | \n",
" 65637 | \n",
" 1 | \n",
" 5101 | \n",
" 2011 | \n",
" 6 | \n",
" 1 | \n",
" 99 | \n",
" 1383 | \n",
" 45.469444 | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 sex borough year_entry ref_from ref_to areaofcare \\\n",
"2523 357421 2 5101 2015 3 2 35 \n",
"1237 347229 2 5101 2015 20 1 80 \n",
"3313 143566 2 5101 2012 14 1 231 \n",
"3764 295860 2 5101 2014 23 2 44 \n",
"982 222439 1 5101 2013 17 2 33 \n",
"2699 458932 2 5601 2016 21 3 80 \n",
"4057 227591 2 5101 2013 15 5 19 \n",
"4089 7999 2 5101 2010 24 1 138 \n",
"2534 465091 2 5101 2016 14 17 84 \n",
"467 65637 1 5101 2011 6 1 99 \n",
"\n",
" entryage wtime is_train \n",
"2523 244 71.088890 False \n",
"1237 28 72.669441 True \n",
"3313 979 75.219444 True \n",
"3764 66 54.150002 False \n",
"982 105 75.266670 True \n",
"2699 16 21.605556 True \n",
"4057 36 30.130556 True \n",
"4089 121 71.505554 True \n",
"2534 28 56.025002 False \n",
"467 1383 45.469444 True "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"WL.sample(10)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['sex', 'borough', 'year_entry', 'ref_from', 'ref_to', 'areaofcare'], dtype='object')"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"features = WL.columns[1:7]\n",
"features"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"RF = RandomForestRegressor(max_features=2, n_estimators=300)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
" max_features=2, max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
" min_impurity_split=None, min_samples_leaf=1,\n",
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
" n_estimators=300, n_jobs=1, oob_score=False, random_state=None,\n",
" verbose=0, warm_start=False)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"RF.fit(train[features], train['wtime'])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sex | \n",
" borough | \n",
" year_entry | \n",
" ref_from | \n",
" ref_to | \n",
" areaofcare | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2 | \n",
" 5601 | \n",
" 2012 | \n",
" 16 | \n",
" 3 | \n",
" 304 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 5101 | \n",
" 2008 | \n",
" 33 | \n",
" 1 | \n",
" 138 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 5101 | \n",
" 2013 | \n",
" 21 | \n",
" 1 | \n",
" 115 | \n",
"
\n",
" \n",
" 3 | \n",
" 2 | \n",
" 5101 | \n",
" 2010 | \n",
" 6 | \n",
" 1 | \n",
" 149 | \n",
"
\n",
" \n",
" 4 | \n",
" 2 | \n",
" 5101 | \n",
" 2010 | \n",
" 6 | \n",
" 1 | \n",
" 138 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sex borough year_entry ref_from ref_to areaofcare\n",
"0 2 5601 2012 16 3 304\n",
"1 2 5101 2008 33 1 138\n",
"2 1 5101 2013 21 1 115\n",
"3 2 5101 2010 6 1 149\n",
"4 2 5101 2010 6 1 138"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[features].head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
}
],
"source": [
"train['wtime'] = train['wtime'].astype('float32')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 551.0\n",
"1 766.0\n",
"2 39.0\n",
"3 1.0\n",
"4 2.0\n",
"Name: wtime, dtype: float32"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train['wtime'].head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('sex', 0.042382835374565803),\n",
" ('borough', 0.087496141690705811),\n",
" ('year_entry', 0.1597124004227839),\n",
" ('ref_from', 0.24746011770094545),\n",
" ('ref_to', 0.071196461045226353),\n",
" ('areaofcare', 0.39175204376577216)]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(zip(train[features], RF.feature_importances_))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'clf' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfeaturesImportance\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'variables'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'importance'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeature_importances_\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mby\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'importance'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mfeaturesImportance\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'clf' is not defined"
]
}
],
"source": [
"featuresImportance = pd.DataFrame({'variables':features,'importance':clf.feature_importances_}).sort_values(by='importance',ascending=False)\n",
"featuresImportance"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"plt.barh(featuresImportance.variables,featuresImportance.importance)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}