In [ ]:
#!/usr/bin/env/python
import tweepy
import time as tm
import random as rd
# you first need to register for key information to use Twitter's API
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
ACCESS_TOKEN = ''
ACCESS_TOKEN_SECRET = ''
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.secure = True
auth.set_access_token(ACCESS_TOKEN,ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True,wait_on_rate_limit_notify=True)
# Now periodically check the user's twitter feed for new info.
ct = 20 # only pull this many tweets (change this to whatever you want)
newest = []
new_tweets = []
username = '@NYCTSubway'
while 1!=0:
if not newest: # if newest is empty, there is no previous tweet
# makes an initial request for most recent tweets
new_tweets = api.user_timeline(username, count = ct)
else:
new_tweets = api.user_timeline(username, since_id = newest)
if not new_tweets: # if there are no new tweets (or no tweets at all)
print('no new tweets')
else:
outtweets = ([[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")]
for tweet in new_tweets])
# print out some relevant tweet information
for j in range(0,len(outtweets)):
twt = outtweets[j]
print(twt)
# save the id of the newest tweet plus one (plus 1 to avoid overlap)
newest = new_tweets[0].id + 1
# sleep for a bit to avoid rate limit errors
tm.sleep(rd.randint(180,240))
In [1]:
# import some basic stuff
import pandas as pd
import numpy as np
from copy import deepcopy
from scipy import interp
# stuff for plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# read a database from CSV and load it into a pandas dataframe
file = 'directory for training data csv file'
data = pd.read_csv(file)
del data['id'] #delete the 'id' column because I don't want that in the model
In [2]:
data.head(5)
Out[2]:
In [3]:
plt.hist(pd.to_numeric(data['delay']))
plt.xlabel('minutes')
plt.ylabel('occurrences')
print('median of delay time is ' + str(np.median(data['delay'])) + ' minutes')
In [4]:
# This is a classification problem, so here I discretize the data
y = data['delay']
def discretize_response(y):
ystr = []
for i in range(0,len(y)):
if y[i] < 15:
s = 0 #'less than 15'
else:
s = 1 #'greater than 10'
ystr.append(s)
# display the distribution of positive (>=15 minutes) and negative (<15 minutes) classes
y = pd.DataFrame({'delay':ystr})
return y
y = discretize_response(y)
print(y['delay'].value_counts())
In [6]:
# do one-hot-encoding for categorical features
X = data.ix[:,'structure':'routeZ']
# print the original list of features
print("Original features:\n", list(X.columns), "\n")
X_dummies = pd.get_dummies(X,columns=['structure', 'event_month','event_weekday',
'direction','what_happened','event_hour'])
del X_dummies['structure_Embankment'] #has to be done to match the test set
# print the list of features after creating dummy variables
print("Features after get_dummies:\n", list(X_dummies.columns))
In [7]:
# here's the function to consolidate the dummy features
def consolidate_features(X):
# reorganize the features
#X = deepcopy(X)
# consolidate months down to seasons
X['winter'] = X['event_month_12'] + X['event_month_1'] + X['event_month_2']
X['spring'] = X['event_month_3'] + X['event_month_4'] + X['event_month_5']
X['summer'] = X['event_month_6'] + X['event_month_7'] + X['event_month_8']
X['autumn'] = X['event_month_9'] + X['event_month_10'] + X['event_month_11']
# consolidate times of day to rush hour morning / evening, work hours, evening / sleep hours
# notice, hours are in GMT
X['time_rush_hour_eve'] = X['event_hour_19'] + X['event_hour_20'] + X['event_hour_21'] #GMT
X['time_rush_hour_morn'] = X['event_hour_10'] + X['event_hour_11'] + X['event_hour_12'] #GMT
X['time_work'] = (X['event_hour_13'] + X['event_hour_14'] + X['event_hour_15'] +
X['event_hour_16'] + X['event_hour_17'] + X['event_hour_18']) #GMT
X['time_eve'] = X['event_hour_22'] + X['event_hour_23'] + X['event_hour_0']
X['time_sleep'] = (X['event_hour_1'] + X['event_hour_2'] + X['event_hour_3'] +
X['event_hour_4'] + X['event_hour_5'] + X['event_hour_6'] +
X['event_hour_7'] + X['event_hour_8'] + X['event_hour_9'])
# consolidate day of week into weekday / weekend
X['weekday'] = (X['event_weekday_0'] + X['event_weekday_1'] + X['event_weekday_2'] +
X['event_weekday_3'] + X['event_weekday_4'])
X['weekend'] = X['event_weekday_5'] + X['event_weekday_6']
# remove unnecessary data
# I removed routes because those were highly correlated to the station locations
cols = [c for c in X.columns if c.lower()[:5] != 'event']
X = X[cols]
cols = [c for c in X.columns if c.lower()[:5] != 'route']
X = X[cols]
return X
X_dummies = consolidate_features(X_dummies)
# print the new list of consolidated features
print('the new feature names are: ')
list(X_dummies)
Out[7]:
In [8]:
# import stuff for building and testing model
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, f1_score, classification_report
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import RidgeClassifier
# apply regularization to the classifier (I tried many values -- 30 seemed best)
classifier = RidgeClassifier
# normalize the data (this mainly applies to the station geographic coordinates)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_dummies)
X_dummies_scaled = scaler.transform(X_dummies)
# use SMOTE for creating balanced data sets (oversample the minority class)
from imblearn.over_sampling import SMOTE
smote = SMOTE(kind = "regular")
X_dummies_scaled_sm, y_sm = smote.fit_sample(X_dummies_scaled, y)
In [10]:
# function for plotting ROC curves for each validation fold
def plot_regularization(X,y):
# check for optimal regularization
alphas = [0.01,0.03,0.1,0.3,1,3,10,30,100,300,1000,3000]
# prepare to run classifier with cross-validation
cv = StratifiedKFold(y, n_folds=5)
# create an empty array to hold model performance metrics
f1scores = np.zeros((5,len(alphas)))
# loop through validation folds
for i, (train, val) in enumerate(cv):
# loop through regularization values
for j, alpha in enumerate(alphas):
# fit to the training data and validate for the ith fold
ridge = classifier(alpha).fit(X[train], y[train])
# get performance metrics
y_pred = ridge.predict(X[val])
f1scores[i][j] = f1_score(y[val], y_pred, average="macro")
plt.plot(np.log10(alphas),f1scores[i][:], marker='o', label=('fold ' + str(i)))
#plt.ylim([0.5,0.75])
plt.xlim([np.log10(alphas[0]),np.log10(100000)])
plt.xlabel('log (alpha)')
plt.ylabel('F1 score')
plt.title('F1 scores')
plt.legend(loc="lower right")
# plot out the effect of regularization on the F1 score
plot_regularization(X_dummies_scaled_sm, y_sm)
In [12]:
# function for plotting ROC curves for each validation fold
def plot_cv_roc(X,y):
# prepare to run classifier with cross-validation
cv = StratifiedKFold(y, n_folds=5)
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
for i, (train, val) in enumerate(cv):
# fit to the training data and validate for the ith fold
# set the regularization parameter, alpha, based on previous plot
probas_ = classifier(alpha=300).fit(X[train], y[train]).decision_function(X[val])
# Compute ROC curve and area under the curve
fpr, tpr, thresholds = roc_curve(y[val], probas_)
# use to calculate the mean ROC
mean_tpr += interp(mean_fpr, fpr, tpr)
mean_tpr[0] = 0.0
# plot the ROC for the ith fold
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
# plot the chance ROC
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
# plot the mean ROC
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# plot the ROC curves for each fold
plot_cv_roc(X_dummies_scaled_sm, y_sm)
In [13]:
# cross validation suggests the model is decent --
# now use all the data to create the model
classifier = RidgeClassifier(alpha=300) #use optimum regularization, alpha, as determined above
final_model = classifier.fit(X_dummies_scaled_sm, y_sm)
In [16]:
# read in hold-out test data set, and do all the preprocessing that was done before
# on the training data
file = 'directory for hold-out test data csv file'
data = pd.read_csv(file)
del data['id']
data.shape
# discretize the response variable
y_test = data['delay']
y_test = discretize_response(y_test)
# convert categorical data to dummy variables
X_test = data.ix[:,'structure':'routeZ']
X_dummies_test = pd.get_dummies(X_test,columns=['structure', 'event_month','event_weekday',
'direction','what_happened','event_hour'])
#del X_dummies_test['structure_Embankment'] #has to be done to match the test set
# consolidate the dummy variables
X_dummies_test = consolidate_features(X_dummies_test)
# scale the test data according to the training data
X_dummies_scaled_test = scaler.transform(X_dummies_test)
# print size of the test data set (samples by features)
print(X_dummies_scaled_test.shape)
In [17]:
pred_test = final_model.predict(X_dummies_scaled_test)
# print ridge regression accuracy on test data
print("ridge regression, test data: ")
print(classification_report(y_test, pred_test,
target_names=["less than 15","15 or greater"]))
print('final model accuracy: %0.2f' % accuracy_score(y_test,pred_test))
In [18]:
# print out the weights of the classifier
y1 = np.transpose(classifier.coef_)
x = np.arange(len(y1))
plt.figure(num=None, figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k')
ylab = X_dummies.columns[0:-1]
plt.barh(x,y1)
plt.yticks(x+0.5,ylab)
plt.ylabel("feature")
plt.xlabel("theta")
Out[18]:
Comments
Post a Comment