# Prediction of waiting time using R

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.metrics import make_scorer

import xgboost as xgb

import pandas as pd

import numpy as np

np.random.seed(0)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [5]:
WL=pd.read_csv('valpos.csv', sep=',')
WL.dropna
WL.head()

Unnamed: 0.1,Unnamed: 0,sex,borough,year_entry,ref_from,ref_to,areaofcare,wtime,entryage
0,139702,2,5601,2012,16,3,304,551,78.908333
1,261,2,5101,2008,33,1,138,766,69.394447
2,256217,1,5101,2013,21,1,115,39,0.963889
3,7373,2,5101,2010,6,1,149,1,1.122222
4,32944,2,5101,2010,6,1,138,2,83.269447


In [6]:
WL.columns.values

array(['Unnamed: 0', 'sex', 'borough', 'year_entry', 'ref_from', 'ref_to',
       'areaofcare', 'wtime', 'entryage'], dtype=object)

In [7]:
WL['is_train'] = np.random.uniform(0, 1, len(WL)) <= .75
WL.head()

Unnamed: 0.1,Unnamed: 0,sex,borough,year_entry,ref_from,ref_to,areaofcare,wtime,entryage,is_train
0,139702,2,5601,2012,16,3,304,551,78.908333,True
1,261,2,5101,2008,33,1,138,766,69.394447,True
2,256217,1,5101,2013,21,1,115,39,0.963889,True
3,7373,2,5101,2010,6,1,149,1,1.122222,True
4,32944,2,5101,2010,6,1,138,2,83.269447,True


In [8]:
train, test = WL[WL['is_train']==True], WL[WL['is_train']==False]

In [9]:
print('Observations for training:', len(train))
print('Observations for testing:',len(test))

Observations for training: 3757
Observations for testing: 1236


In [10]:
# get a list of the columns
col_list = list(WL)
col_list

['Unnamed: 0',
 'sex',
 'borough',
 'year_entry',
 'ref_from',
 'ref_to',
 'areaofcare',
 'wtime',
 'entryage',
 'is_train']

In [11]:
# use this handy way to swap the elements
col_list[7], col_list[8] = col_list[8], col_list[7]
# assign back, the order will now be swapped
WL.columns = col_list

In [12]:
WL.sample(10)

Unnamed: 0.1,Unnamed: 0,sex,borough,year_entry,ref_from,ref_to,areaofcare,entryage,wtime,is_train
2523,357421,2,5101,2015,3,2,35,244,71.08889,False
1237,347229,2,5101,2015,20,1,80,28,72.669441,True
3313,143566,2,5101,2012,14,1,231,979,75.219444,True
3764,295860,2,5101,2014,23,2,44,66,54.150002,False
982,222439,1,5101,2013,17,2,33,105,75.26667,True
2699,458932,2,5601,2016,21,3,80,16,21.605556,True
4057,227591,2,5101,2013,15,5,19,36,30.130556,True
4089,7999,2,5101,2010,24,1,138,121,71.505554,True
2534,465091,2,5101,2016,14,17,84,28,56.025002,False
467,65637,1,5101,2011,6,1,99,1383,45.469444,True


In [13]:
features = WL.columns[1:7]
features

Index(['sex', 'borough', 'year_entry', 'ref_from', 'ref_to', 'areaofcare'], dtype='object')

In [14]:
RF = RandomForestRegressor(max_features=2, n_estimators=300)

In [15]:
RF.fit(train[features], train['wtime'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=2, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [16]:
train[features].head()

Unnamed: 0,sex,borough,year_entry,ref_from,ref_to,areaofcare
0,2,5601,2012,16,3,304
1,2,5101,2008,33,1,138
2,1,5101,2013,21,1,115
3,2,5101,2010,6,1,149
4,2,5101,2010,6,1,138


In [17]:
train['wtime'] = train['wtime'].astype('float32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
train['wtime'].head()

0    551.0
1    766.0
2     39.0
3      1.0
4      2.0
Name: wtime, dtype: float32

In [19]:
list(zip(train[features], RF.feature_importances_))

[('sex', 0.042382835374565803),
 ('borough', 0.087496141690705811),
 ('year_entry', 0.1597124004227839),
 ('ref_from', 0.24746011770094545),
 ('ref_to', 0.071196461045226353),
 ('areaofcare', 0.39175204376577216)]

In [20]:
featuresImportance = pd.DataFrame({'variables':features,'importance':clf.feature_importances_}).sort_values(by='importance',ascending=False)
featuresImportance

NameError: name 'clf' is not defined

In [None]:
plt.barh(featuresImportance.variables,featuresImportance.importance)