Data Visualization with R Programming

Kuroun Seung
4 min readJan 29, 2018

Download the Divvy Bikes dataset analytics data from http://www.divvybikes.com/datachallenge

Initializing Dataset and Setting Frequently Used Variables

  • To bind trips dataset:
tripQ1_Q2 <- read.csv("Divvy_Trips_2014_Q1Q2.csv")
tripQ3_07 <- read.csv("Divvy_Trips_2014-Q3-07.csv")
tripQ3_0809 <- read.csv("Divvy_Trips_2014-Q3-0809.csv")
tripQ4 <- read.csv("Divvy_Trips_2014-Q4.csv")
trip <- rbind(tripQ1_Q2, tripQ3_07, tripQ3_0809, tripQ4)
  • To bind stations dataset:
station1 <- read.csv("Divvy_Stations_2014-Q1Q2.csv")
station2 <- read.csv("Divvy_Stations_2014-Q3Q4.csv")
  • Convert date time string to hour only:
hour <- format(strptime(trip$starttime,format='%m/%d/%Y %H:%M'), '%H')
  • Function to determine season by input date:
getSeason <- function(DATES) {
WS <- as.Date("12/15/2012", format = "%m/%d/%Y") # Winter
SE <- as.Date("3/15/2012", format = "%m/%d/%Y") # Spring
SS <- as.Date("6/15/2012", format = "%m/%d/%Y") # Summer
FE <- as.Date("9/15/2012", format = "%m/%d/%Y") # Fall
d <- as.Date(paste(format(strptime(DATES,format="%m/%d/%Y"),"%m/%d"), "2012",sep="/"),format = "%m/%d/%Y")
ifelse (d >= WS | d < SE, "Winter",
ifelse (d >= SE & d < SS, "Spring",
ifelse (d >= SS & d < FE, "Summer", "Fall")))
}
  • Finding dataset of weekday: (number of trips per weekday) and re-order it none alphabetically:
dayofweek <- weekdays(strptime(trip$starttime,format="%m/%d/%Y %H:%M"))
dayofweek <- data.frame(dayofweek)
dayofweek$dayofweek <- factor(dayofweek$dayofweek, levels= c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
dayofweek <- dayofweek[order(dayofweek$dayofweek), ]

Basic Chart

  • Number of Trips by Hour
num_of_trips_by_hour = table(hour)
barplot(num_of_trips_by_hour)
  • Number of Trips by Weekday
num_trips_by_wd = table(weekdays(strptime(trip$starttime,format="%m/%d/%Y %H:%M")))
num_trips_by_wd <- data.frame(num_trips_by_wd)
num_trips_by_wd $Var1 <- factor(num_trips_by_wd $Var1, levels= c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
num_trips_by_wd <- num_trips_by_wd [order(num_trips_by_wd $Var1), ]
barplot(as.matrix(num_trips_by_wd[,2]), main="Number of Trips by Weekday", ylab= "Number of Trips",beside=TRUE, col= rainbow(7),names=num_trips_by_wd$Var1)
  • Number of Trips by User Type
num_trips_by_utype <- table(trip$usertype)
user_percentage <- round(num_trips_by_utype /sum(num_trips_by_utype)*100,2)
labels <- c('Customer','Subscriber')
pie(num_trips_by_type, label=paste(labels,paste(user_percentage,"%")))
  • Number of Trips by Station Pair: find the top 10 station pairs which has the most number of trips in 2014
num_trips_by_sp <- table(trip$from_station_name, trip$to_station_name)
num_trips_by_sp <- data.frame(num_trips_by_sp)
colnames(num_trips_by_sp) <-c("from_station","to_station","trip")
num_trips_by_sp1<-head(num_trips_by_sp[order(-num_trips_by_sp$trip),],10)
bp_num_sp <- barplot(as.matrix(num_trips_by_sp1[,3]), main="Top 10 Most Station Pairs", ylab= "Number of Trips",xlab="Stations",beside=TRUE, col= rainbow(10))
text(bp_num_sp, 50, paste(num_trips_by_sp1$from_station, num_trips_by_sp1$to_station, sep=' - '), cex=1, pos=4, srt=80)
  • Number of Trips by Bike ID: find the top 30 bikes used in 2014
num_trips_by_bike <- table(trip$bikeid)
num_trips_by_bike <- data.frame(num_trips_by_bike)
colnames(num_trips_by_bike) <- c("bike", "trip")
num_trips_by_bike1 <- head(num_trips_by_bike[order(-num_trips_by_bike$trip),],30)
bp_num_bike <- barplot(as.matrix(num_trips_by_bike1[,2]), main="Top 30 Most Bikes Used", ylab= "Number of Trips",xlab="Bikes",beside=TRUE, col= rainbow(30))
text(bp_num_bike, 50, num_trips_by_bike1$bike, cex=1, pos=4, srt=90)

Cross Tabulate Chart

  • Number of Trips by Hour X DayOfWeek
hour_x_dayofweek = table(hour, dayofweek)
barplot(as.matrix(hour_x_dayofweek), main="Hour X WeekDay", ylab= "Number of Trips",beside=TRUE, col= rainbow(24))
legend("top", c('00','01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23'), cex=0.6, bty="n", fill=rainbow(24),ncol=4)
  • Number of Trips by UserType X Season
seasons <- getSeason(trip$starttime)
user_types <- trip$usertype
barplot(as.matrix(usertype_x_season), main="UserType X Season", ylab= "Number of Trips",beside=TRUE, col= rainbow(2))
legend("topleft", c('Customer', 'Subscriber'), cex=0.8, bty="n", fill=rainbow(2))
  • Number of Trips by UserType X DayOfWeek
user_type_x_dayofweek = table(user_types, dayofweek)
barplot(as.matrix(user_type_x_dayofweek), main="UserType X DayOfWeek", ylab= "Number of Trips",beside=TRUE, col= rainbow(2))
legend("topleft", c('Customer', 'Subscriber'), cex=0.8, bty="n", fill=rainbow(2))
  • Number of Trips by StationPair X DayOfWeek: find the most station pairs used each weekday in 2014
temp <- data.frame(stationpair_x_dayofweek)
temp1 <- aggregate(temp$Freq,list(temp$from_station,temp$to_station,temp$dayofweek),FUN=sum)
temp2 <- data.frame(from=temp1$Group.1,to=temp1$Group.2,day=temp1$Group.3,trip=temp1$x)
temp3<-by(data = temp2, INDICES = list(temp2$day), function(x) x[which.max(x$trip), ])
temp4 <- do.call(rbind, temp3)
x <- barplot(as.matrix(temp4[,4]), main="StationPair X WeekDay", ylab= "Number of Trips",beside=TRUE, col= rainbow(7),names=temp4$day)
text(x, 50, paste(temp4$from,temp4$to,sep=' - '),cex=1,pos=4,srt=45)

Chart with Third party API: Google Map

  • Which top 10 stations are the most active (start trip by user)?
map1<-data.frame(table(trip$from_station_name))
map2<-data.frame(from_station=map1$Var1,trip=map1$Freq)
map3<-merge(map2,station,by.x="from_station",by.y="name")
map4<-head(map3[order(-map3$trip),],10)
install.packages(‘ggmap’)
library(ggmap)
geocode("chicago")
map5<- get_map(location = c (-87.6298, 41.87811), source = "google", zoom = 12, maptype = "satellite")
map6<- ggmap(map5,extend=TRUE)+geom_point(data=map4, aes(map4$longitude,map4$latitude),size=1,color="red") + geom_text(data = map4, aes(x = map4$longitude, y = map4$latitude, label = paste(map4$from_station,map4$trip,sep="-")), size = 2,hjust = -0.1,color='white')
ggplot_build(map6)

Reference

--

--