Analysing tube journeys with Folium


This post analyses two years' worth of Oyster card data, totalling over 750 journeys across 69 tube stations. You can view your own journey history if you've registered your Oyster card with TfL.

In [1]:
## import the required packages
from __future__ import division

import re
from textwrap import wrap

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from folium import plugins
from shapely.geometry import Point

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = (10.0, 6.0)
In [2]:
## you can read in your own data here
df = pd.read_excel('../../../Dropbox/tube.xlsx')

# we're only interested in tube journeys so let's ignore bus journeys and top-up information
ignore = ['Auto top-up', 'Topped up', 'Topped-up', 'Bus journey', 'Entered']

# The terms are joined into a single regex alternation. `na=True` marks rows
# with no journey text as "ignore": without it the mask would contain NaN and
# the `~` negation below would raise instead of filtering.
is_ignored = df.journey.str.contains('|'.join(ignore), na=True)
df = df[~is_ignored].reset_index(drop=True)

The raw data provided by TfL contains the date, start and end times of the tube journey as well as the fare and card balance at the time of the journey.

In [3]:
# Preview 10 randomly chosen rows of the filtered journeys
# (no random_state is set, so the rows shown differ on every run)
df.sample(10)
Out[3]:
date start end journey charge credit balance note
241 2017-03-27 08:34:00 08:54:00 Earls Court to Temple 2.4 NaN 13.20 NaN
419 2016-12-05 15:28:00 15:53:00 Earls Court to Waterloo (Jubilee line entrance) 0.6 NaN 26.70 NaN
665 2016-04-08 22:12:00 22:21:00 South Kensington to West Kensington 0.6 NaN 2.70 NaN
199 2017-04-26 08:27:00 08:46:00 Earls Court to Paddington (Bakerloo, Circle/Di... 2.4 NaN 20.20 NaN
205 2017-04-22 10:43:00 10:59:00 Earls Court to Edgware Road (District, Circle,... 1.6 NaN 9.90 NaN
463 2016-11-14 15:25:00 15:46:00 Earls Court to Waterloo (Jubilee line entrance) 0.6 NaN 15.55 NaN
41 2017-08-11 20:16:00 20:30:00 Earls Court to Edgware Road (District, Circle,... 1.6 NaN 11.75 NaN
531 2016-10-07 09:36:00 09:56:00 Earls Court to Temple 0.6 NaN 22.80 NaN
347 2017-01-31 20:18:00 20:26:00 South Kensington to Earls Court 0.6 NaN 21.75 NaN
300 2017-02-27 08:35:00 08:59:00 Earls Court to Temple 2.4 NaN 10.80 NaN

Let's apply some transformations to extract the relevant information from the raw data.

In [4]:
## Append dates and times and convert to timestamp objects
df['start_time'] = pd.to_datetime(df.date.astype(str) + ' ' + df.start.astype(str))
df['end_time'] = pd.to_datetime(df.date.astype(str) + ' ' + df.end.astype(str))

## Calculate duration of journey in minutes
# Both timestamps are built from the same `date`, so a journey whose end time
# is earlier than its start time yields a non-zero `days` component; as before,
# such journeys are recorded with a duration of zero.
elapsed = df.end_time - df.start_time
df['duration'] = elapsed.where(elapsed.dt.days == 0, pd.Timedelta(0))
# whole minutes; equals hours * 60 + minutes because the days component is always zero here
df['minutes'] = (df.duration.dt.total_seconds() // 60).astype(int)

## Remove strings between brackets i.e. [],() in journey column
# raw string: "\(" and "\[" are not valid escapes in an ordinary string literal
_BRACKETED = re.compile(r"[\(\[].*?[\)\]]")
unbracketed = df.journey.apply(lambda x: _BRACKETED.sub("", x))

# station names are taken from either side of the first/last ' to ' separator
df['start_station'] = unbracketed.apply(lambda x: x.split(' to ')[0].strip())
df['end_station'] = unbracketed.apply(lambda x: x.split(' to ')[-1].strip())

# Removing a bracketed note leaves stray or doubled spaces behind
# (e.g. 'Earls Court to Waterloo '); collapse them so the same route is not
# counted under several spellings when journeys are tallied.
df['journey'] = unbracketed.apply(lambda x: ' '.join(x.split()))

Now let's join the journey dataset with another dataset that has the geographic coordinates of every underground station obtained from here.

In [5]:
stations = pd.read_csv('../../../Dropbox/stations.csv')

## build one Shapely Point per station from its lat/lon pair
# NOTE: the points are stored as (lat, lon); the mapping cells pass
# `coords[0]` straight to folium as a location, so this order is kept.
stations['location'] = [Point((lat, lon)) for lat, lon in zip(stations.lat, stations.lon)]

## the raw coordinate columns are no longer needed
for raw_column in ('lat', 'lon'):
    del stations[raw_column]

## promote the station table to a Geopandas dataframe keyed on the Point column
stations = gpd.GeoDataFrame(stations, geometry='location', crs={'init' :'epsg:4326'})
In [6]:
# Attach the station table twice with left joins: first matched on the
# departure station, then on the arrival station. The second join's
# overlapping columns receive the '_start' / '_end' suffixes.
with_start = df.merge(stations, how='left', left_on='start_station', right_on='station')
df = with_start.merge(stations,
                      how='left',
                      left_on='end_station',
                      right_on='station',
                      suffixes=('_start', '_end'))

# the joined-in station name columns duplicate start_station / end_station
for duplicate_column in ('station_start', 'station_end'):
    del df[duplicate_column]
In [7]:
# folium doesn't render strings with apostrophes
stations.loc[:, 'station'] = stations.station.str.replace("'", '')

# The station names held in `df` are later compared against `stations.station`
# and used as popup text on the maps, so they need the same treatment;
# otherwise a station with an apostrophe would no longer match its own entry.
for station_column in ['start_station', 'end_station']:
    df[station_column] = df[station_column].str.replace("'", '')

For each journey we now have the start/end times, the fare and the start/end station names and their coordinates.

In [8]:
# Preview 10 randomly chosen journeys, now including the derived time,
# station and location columns (rows differ on every run)
df.sample(10)
Out[8]:
date start end journey charge credit balance note start_time end_time duration minutes start_station end_station zone_start location_start zone_end location_end
682 2016-03-20 11:29:00 11:55:00 West Kensington to Marylebone 0.60 NaN 4.20 NaN 2016-03-20 11:29:00 2016-03-20 11:55:00 00:26:00 26 West Kensington Marylebone 2 POINT (51.49010936 -0.206203695) 1 POINT (51.52239667 -0.163492564)
684 2016-03-19 14:49:00 15:00:00 West Kensington to South Kensington 0.60 NaN 5.40 NaN 2016-03-19 14:49:00 2016-03-19 15:00:00 00:11:00 11 West Kensington South Kensington 2 POINT (51.49010936 -0.206203695) 1 POINT (51.49407137 -0.173923023)
354 2017-01-28 20:40:00 21:09:00 Earls Court to Marylebone 0.60 NaN 25.95 NaN 2017-01-28 20:40:00 2017-01-28 21:09:00 00:29:00 29 Earls Court Marylebone 1,2 POINT (51.49135667 -0.194313809) 1 POINT (51.52239667 -0.163492564)
251 2017-03-22 08:33:00 08:49:00 Earls Court to Paddington 2.40 NaN 9.90 NaN 2017-03-22 08:33:00 2017-03-22 08:49:00 00:16:00 16 Earls Court Paddington 1,2 POINT (51.49135667 -0.194313809) 1 POINT (51.51539379 -0.175736752)
609 2016-06-15 16:32:00 17:27:00 Prince Regent DLR to West Kensington 0.85 NaN 0.25 NaN 2016-06-15 16:32:00 2016-06-15 17:27:00 00:55:00 55 Prince Regent DLR West Kensington 3 POINT (51.50943455 0.033488574) 2 POINT (51.49010936 -0.206203695)
17 2017-08-31 08:25:00 08:51:00 Earls Court to Paddington 2.40 NaN 21.35 NaN 2017-08-31 08:25:00 2017-08-31 08:51:00 00:26:00 26 Earls Court Paddington 1,2 POINT (51.49135667 -0.194313809) 1 POINT (51.51539379 -0.175736752)
554 2016-09-27 17:12:00 17:45:00 Earls Court to Old Street 0.60 NaN 21.00 NaN 2016-09-27 17:12:00 2016-09-27 17:45:00 00:33:00 33 Earls Court Old Street 1,2 POINT (51.49135667 -0.194313809) 1 POINT (51.52558147 -0.087622958)
76 2017-07-14 17:32:00 18:00:00 Earls Court to Oval 2.90 NaN 23.70 NaN 2017-07-14 17:32:00 2017-07-14 18:00:00 00:28:00 28 Earls Court Oval 1,2 POINT (51.49135667 -0.194313809) 2 POINT (51.4814126 -0.113321076)
291 2017-03-03 10:50:00 11:08:00 Earls Court to Temple 1.60 NaN 14.90 NaN 2017-03-03 10:50:00 2017-03-03 11:08:00 00:18:00 18 Earls Court Temple 1,2 POINT (51.49135667 -0.194313809) 1 POINT (51.51104061 -0.11372575)
25 2017-08-23 20:15:00 20:28:00 Paddington to Earls Court 1.60 NaN 18.15 NaN 2017-08-23 20:15:00 2017-08-23 20:28:00 00:13:00 13 Paddington Earls Court 1 POINT (51.51539379 -0.175736752) 1,2 POINT (51.49135667 -0.194313809)

Data exploration

In [9]:
# print(...) with a single argument behaves the same on Python 2 and Python 3
n_stations = np.unique(df[['start_station', 'end_station']].values).size
total_days = df.minutes.sum() / 60.0 / 24

print("Total number of journeys: {}".format(len(df)))
print("Total number of unique stations visited: {}".format(n_stations))
print("Total number of unique journeys: {}".format(df.journey.nunique()))
# format to two decimals instead of appending a literal '0', which was only
# correct when the total happened to have exactly one decimal place
print("Total amount spent on tube journeys: £{:.2f}".format(df.charge.sum()))
print("Total time spent in a tube station: {} days".format(round(total_days, 2)))
Total number of journeys: 768
Total number of unique stations visited: 69
Total number of unique journeys: 155
Total amount spent on tube journeys: £960.80
Total time spent in a tube station: 10.36 days
In [10]:
# the 9 most frequent departure stations, computed once for both bars and labels
top_departures = df.start_station.value_counts()[:9]
# wrap long station names over several lines so the tick labels don't overlap
labels = ['\n'.join(wrap(l, 11)) for l in top_departures.index]

ax = top_departures.plot(kind='bar', rot=0)
ax.set_xticklabels(labels)
plt.ylabel('frequency')
plt.title('Most popular departure stations')
# tight_layout runs last so the title and labels are part of what it fits
plt.tight_layout();
In [11]:
# the 10 most frequent journeys, computed once for both bars and labels
top_journeys = df.journey.value_counts()[:10]
# wrap long journey names over several lines so the tick labels don't overlap
labels = ['\n'.join(wrap(l, 11)) for l in top_journeys.index]

ax = top_journeys.plot(kind='bar', rot=0)
ax.set_xticklabels(labels)
plt.ylabel('frequency')
plt.yticks(np.arange(0, 100, 10))
plt.title('Most frequent journeys')
# tight_layout runs last so the title and labels are part of what it fits
plt.tight_layout();
In [12]:
# `Series.dt.weekday_name` was removed in pandas 1.0. Mapping `dt.dayofweek`
# (Monday == 0) to explicit names works on old and new pandas alike and does
# not depend on the locale.
day_names = dict(enumerate(['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                            'Friday', 'Saturday', 'Sunday']))
journeys_per_day = df.date.dt.dayofweek.map(day_names).value_counts()

# `rot` is an angle in degrees; rot=True was interpreted as a 1 degree tilt
journeys_per_day.plot(kind='bar', rot=0)
plt.ylabel('frequency')
plt.title('Number of journeys by day of the week')
plt.tight_layout();
In [13]:
# count journeys per departure hour; selecting the column before counting
# avoids counting every column of the frame just to keep one of them
journeys_per_hour = df.groupby(df.start_time.dt.hour)['start_time'].count()

# `rot` is an angle in degrees; rot=True was interpreted as a 1 degree tilt
journeys_per_hour.plot(kind='bar', rot=0)
plt.ylabel('frequency')
plt.title('Number of journeys by hour of the day')
plt.tight_layout();
In [14]:
# one histogram bin per minute between the shortest and longest journey
minute_span = df.minutes.max() - df.minutes.min()
ax = df.minutes.hist(bins=minute_span)

# fixed viewing window and a tick every 5 minutes
ax.set_xlim(0, 65)
ax.set_ylim(0, 50)
ax.set_xticks(np.arange(0, 70, 5))

ax.set_xlabel('Journey length (mins)')
ax.set_ylabel('frequency')
ax.set_title('Distribution of journey times')
plt.tight_layout();

Folium maps

All the tube stations I've visited

In [15]:
## every station name that appears as either a departure or an arrival
visited_names = np.unique(df[['start_station', 'end_station']].values)
## keep only those stations, together with their coordinates
visited_stations = stations[stations.station.isin(visited_names)]

m = folium.Map(tiles='cartodbpositron', location=(51.51, -0.1), zoom_start=12)

# one red marker per visited station; clicking it shows the station name
for station_row in visited_stations.itertuples():
    marker = folium.Marker(location=station_row.location.coords[0],
                           popup=station_row.station,
                           icon=folium.Icon(color='red'))
    marker.add_to(m)

# add a fullscreen button
fullscreen_button = plugins.Fullscreen(position='topright',
                                       title='Fullscreen',
                                       title_cancel='Exit',
                                       force_separate_button=True)
fullscreen_button.add_to(m)

m
Out[15]:

Plot most common routes

In [16]:
def plot_routes(n=10):
    """
    Plot n most common routes.

    A route is a (start_station, end_station) pair taken from the module-level
    `df`; coordinates come from the module-level `stations` frame. Each route
    is drawn as a black line whose weight is log(journey count + 0.5) and whose
    popup shows the route and its count. Every route endpoint found in
    `stations` gets a blue marker.

    Parameters
    ----------
    n : int
        Number of most frequent routes to draw.

    Returns
    -------
    folium.Map
        Map with the routes, the station markers and a fullscreen button.
    """

    m = folium.Map(tiles='cartodbpositron', location=(51.51, -0.15), zoom_start=13)

    routes = df.groupby(['start_station', 'end_station']).size().sort_values(ascending=False).head(n)

    # Build the name -> coordinates lookup once instead of filtering the whole
    # station frame twice per route. setdefault keeps the first row for a
    # duplicated name, matching the previous `.iloc[0]` lookup.
    coords = {}
    for row in stations.itertuples():
        coords.setdefault(row.station, row.location.coords[0])

    endpoints = set()
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0
    for (start, end), count in routes.items():
        endpoints.update((start, end))

        # a station missing from `stations` has no coordinates to draw to;
        # skip that route rather than failing with an IndexError
        if start not in coords or end not in coords:
            continue

        folium.PolyLine(
            [coords[start], coords[end]],
            weight = np.log(count + 0.5),
            popup = '{} to {}, {} journeys'.format(start, end, count),
            opacity = 1,
            color = 'black').add_to(m)

    ## plot all unique start and end stations
    for i in stations[stations.station.isin(endpoints)].itertuples():
        folium.Marker(location = i.location.coords[0],
                      popup = i.station,
                      icon = folium.Icon(color='blue')).add_to(m)

    # add a fullscreen button
    plugins.Fullscreen(
        position='topright',
        title='Fullscreen',
        title_cancel='Exit',
        force_separate_button=True).add_to(m)

    return m
In [17]:
# render the map for the default n=10 routes
plot_routes()
Out[17]:

All stations visited from a single station

In [18]:
def station_network(s = 'Earls Court'):
    '''
    Returns map of all destination stations departing from `s`.
    Thickness of lines is a function of the frequency of journeys.
    Average journey time obtained from clicking on relevant line.

    Reads the module-level `stations` and `df` frames.

    Parameters
    ----------
    s : str
        Station name, as it appears in `stations.station`.

    Returns
    -------
    folium.Map, or the string 'No journeys from this station.' when `s` is
    not present in `stations`.
    '''

    ## Plot journey start station as red flag
    start_station = stations[stations.station == s]
    if len(start_station) == 0:
        return 'No journeys from this station.'

    # coordinates of the start station, looked up once and reused below
    origin = start_station.location.iloc[0].coords[0]

    m = folium.Map(tiles='cartodbpositron', location=origin, zoom_start=13)
    folium.Marker(location = origin,
                  popup = start_station.station.iloc[0],
                  icon = folium.Icon(color='red')).add_to(m)

    ## Obtain all destinations from this start station with average journey time
    # Destinations without a matched location cannot be drawn, so drop them
    # up front rather than failing on `.coords` inside the loop.
    journeys = df[(df.start_station == s) & df.location_end.notnull()]
    by_destination = journeys.groupby('end_station')
    end_stations = pd.DataFrame({
        'journeys': by_destination.size(),
        # 'first' instead of 'min': the value is a geometry object, which has
        # no meaningful ordering to take a minimum over
        'location_end': by_destination['location_end'].first(),
        'minutes': by_destination['minutes'].mean()})

    for i in end_stations.itertuples():

        folium.PolyLine(
            [origin,
             i.location_end.coords[0]],
             weight = np.log(i.journeys + 0.5),
             popup = '{} journeys, average time: {} mins'.format(i.journeys, round(i.minutes, 1)),
             opacity = 1,
             color = 'black').add_to(m)

        # the frame is indexed by destination name, so Index is the station
        folium.Marker(location = i.location_end.coords[0],
                      popup = i.Index,
                      icon = folium.Icon(color='blue')).add_to(m)

    # add a fullscreen button
    plugins.Fullscreen(
        position='topright',
        title='Fullscreen',
        title_cancel='Exit',
        force_separate_button=True).add_to(m)

    return m
In [19]:
# render the network for the default start station, 'Earls Court'
station_network()
Out[19]:

Animation of every journey taken in sequential order

In [20]:
m = folium.Map(tiles='cartodbpositron', location=(51.51, -0.1), zoom_start=12)

# Journeys whose start station has no matched location carry no geometry and
# would fail on `.coords`; leave them out of the animation.
located = df.dropna(subset=['location_start'])

## convert start times to unix timestamps in milliseconds
# `.values.tolist()` yields integer nanoseconds since the epoch; floor
# division keeps the conversion in exact integer arithmetic
times = [ns // 1000000 for ns in located.start_time.values.tolist()]

# The stored points are (lat, lon) while GeoJSON wants (lon, lat), so each
# coordinate pair is reversed.
coordinates = [point.coords[0][::-1] for point in located.location_start]

# both lists are reversed together so the line and its timestamps stay paired
plugins.TimestampedGeoJson({
     'type': 'FeatureCollection',
     'features': [
       {'type': 'Feature',
        'geometry': {
           'type': 'LineString',
           'coordinates': coordinates[::-1]},
         'properties': {'times': times[::-1]}}]},

    transition_time=500,
    period='P1W',
).add_to(m)

m
Out[20]: