Exploratory Data Analysis of NYC Airbnb Listing Info and Demographics

Keren Xu

2019/12/31

Background and summary of my data

Airbnb is an online marketplace where people rent their unused rooms or entire properties. Instead of owning any of the real estate listings, the company acts as a broker, receiving commissions from each booking. It has been growing rapidly since 2008, and now it has over seven million accommodations in more than 191 countries and regions globally 1. In order to understand its distribution and corresponding determinants in a specific city, we studied the relationship between Airbnb listing price per night in New York City and this city’s demographics and socioeconomics.

Datasets used in this project include New York City Airbnb Open Data, New York City Census Data and the New York City 2010 Census Tracts shape file. New York City Airbnb Open Data is a public dataset that can be downloaded from Inside Airbnb 2, a website that provides listing information compiled from Airbnb. This dataset contains the listing activity and metrics on Airbnb in New York City for 2019. Through this dataset, we can find information on hosts, neighborhoods, locations, room types, prices, and reviews. New York City Census Data is a public dataset from Kaggle 3, containing one csv file, nyc_census_tracts.csv. The nyc_census_tracts.csv was taken from the American Community Survey 2015 5-year estimates DP03 and DP05 tables 4. Variables include total population, racial/ethnic demographic information, employment and commuting characteristics, etc. The New York City 2010 Census Tracts shape file was downloaded from the NYC OpenData website 5. This website holds free public data published by New York City agencies and other partners.

In this project, I explored New York City Airbnb data and socioeconomic characteristics.

Methods and results

load datasets

library(htmltools)
library(tidyverse)
library(lubridate)
library(rgdal)  
library(ggplot2)
library(tableone)
library(scales)
library(ggmap)
library(corrplot)
library(tigris)
library(leaflet) # plotting spatial data
library(sf) # handle simple features objects
library(units) # set_units() for converting units objects


# Use a light ggplot2 theme for every plot in this document.
theme_set(theme_light())  

# Load the Airbnb listings and drop identifier columns not needed for analysis.
# NOTE(review): read.csv is used here but read_csv elsewhere in the file —
# consider standardizing (read.csv returns a base data.frame, read_csv a tibble).
listing <- read.csv("../../static/csv/listings.csv") %>% select(-c("id", "host_id", "name", "host_name"))


# Report dimensions and the overall share of missing values.
print(paste('DataFrame contains ', dim(listing)[1], 'rows and ', dim(listing)[2], 'columns.'))
print(paste('There are ', sum(is.na(listing)), ' missing values representing ', 
           round(100*sum(is.na(listing))/(nrow(listing)*ncol(listing)), 5), '% of the total.'))
print('Variable names are as follows:')
print(names(listing))
head(listing)  
 

# Load the NYC census-tract dataset (tibble via readr).
nyc.df <- read_csv("../../static/csv/nyc_census_tracts.csv")


# Report dimensions and the overall share of missing values.
nyc_dims <- dim(nyc.df)
nyc_na_total <- sum(is.na(nyc.df))
print(paste('DataFrame contains ', nyc_dims[1], 'rows and ', nyc_dims[2], 'columns.'))
print(paste('There are ', nyc_na_total, ' missing values representing ', 
           round(100*nyc_na_total/prod(nyc_dims), 5), '% of the total.'))
print('Variable names are as follows:')
print(names(nyc.df))
head(nyc.df)

check out incomplete data

#function to return number of columns containing NAs
#base::complete.cases() operates on rows
na_col_count <- function(df){
    # vapply is type-stable (always returns logical(1) per column),
    # unlike sapply, whose return type depends on the input.
    na.v <- vapply(df, function(x) any(is.na(x)), logical(1))
    sum(na.v)
}                 
     

#function to return dataframe of columns by NA count
#trim=FALSE also displays complete columns
na_col_sums <- function(df, trim=TRUE){
    # Per-column NA counts; vapply is type-stable, unlike sapply.
    na_counts <- vapply(df, function(x) sum(is.na(x)), integer(1))
    nacols.df <- data.frame(NAs = na_counts)
    # default drop=TRUE would collapse a 1-column subset to a vector
    nacols.df <- nacols.df[order(-nacols.df$NAs), , drop=FALSE]
    nacols.df$Percent_of_DFrows <- round(100*nacols.df$NAs/nrow(df), 2)
    # Guard against 0/0 (NaN) when the data frame has no incomplete rows.
    incomplete <- sum(!complete.cases(df))
    nacols.df$Percent_of_NArows <- if (incomplete == 0) 0 else round(100*nacols.df$NAs/incomplete, 2)
    names(nacols.df) <- c('NAs', 'Percent of Column', 'Percent of Incomplete Rows')
    if(!trim){
        nacols.df
    }else{
        nacols.df[nacols.df$NAs != 0, ]
    }
}

# Missingness summary for the Airbnb listings.
print(paste('Number of rows containing NA values: ', sum(!complete.cases(listing))))
print(paste('Number of columns containing NA values: ', na_col_count(listing)))
               
# Missingness summary for the census data.
print(paste('Number of rows containing NA values: ', sum(!complete.cases(nyc.df))))
print(paste('Number of columns containing NA values: ', na_col_count(nyc.df)))

 
# Per-column NA breakdown for both datasets.
na_col_sums(listing) 
na_col_sums(nyc.df)   

# replace NA with 0 — reviews_per_month is NA when a listing has no
# reviews, so 0 is the natural fill value.
listing <- listing %>% replace_na(list(reviews_per_month = 0))
na_col_sums(listing)  # check again    

# rename the variable neighbourhood_group to borough  
listing <- listing %>% rename(borough = neighbourhood_group) 

#function to filter rows containing NAs
#lim = 0.2 will filter rows where 20% of columns are NA or less, etc. Default lim=0
#keep=TRUE returns the removed rows instead of the cleaned dataframe
row_nafilter <- function(df, keep=FALSE, lim=0){
    # rowMeans(is.na(df)) gives the NA fraction per row directly, without
    # apply()'s data.frame -> matrix coercion (which collapses mixed
    # column types to character).
    row_v <- rowMeans(is.na(df)) <= lim
    if(keep){
        df[!row_v, ]
    }else{
        df[row_v, ]
    }
}

# Inspect the rows being dropped before removing them.
nyc_na.df <- row_nafilter(nyc.df, keep=T) # return the removed rows
# Keep only rows where at most 20% of columns are NA.
nyc.df <- row_nafilter(nyc.df, lim=0.2) # return the cleaned dataframe
print(paste('Count of NA values in nyc_df: ', sum(is.na(nyc.df))))

Exploratory data analysis for the Airbnb dataset: We first examined the distribution of the outcome variable price. We also summarized all the listing variables in this dataset. Frequency and percentage have been summarized for categorical variables. Mean and standard deviation (SD) have been summarized for continuous variables. Spearman correlation coefficients were computed for each pair of variables, and the correlation matrix was presented as a heatmap.

# Listings per (borough, neighbourhood), most frequent first.
listing %>% count(borough, neighbourhood, sort = TRUE) 

# Bar chart of neighbourhoods with more than 1000 listings.
listing %>% count(borough, neighbourhood, sort = TRUE) %>% filter(n > 1000) %>%  ggplot(aes(x = neighbourhood, y = n)) + geom_bar(stat="identity") + coord_flip()  

# Bar chart of listing counts per borough.
listing %>% count(borough, sort = TRUE)  %>% ggplot(aes(x = borough, y = n)) + geom_bar(stat = "identity") + coord_flip()    

# Listing counts per room type.
listing %>% count(room_type)

# get variable names  
dput(names(listing))


# variables that I would like to summarize  
myVars <- c("borough", "neighbourhood", "room_type", "price", "minimum_nights", "number_of_reviews", "reviews_per_month", "availability_365")  

## Vector of categorical variables that need transformation
catVars <- c("borough", "neighbourhood", "room_type")  

## Create a TableOne object (freq/% for categorical, mean/SD for continuous)
tab1 <- CreateTableOne(vars = myVars, data = listing, factorVars = catVars)  
summary(tab1)  

## stratified by borough  

# borough moves from summarized variable to stratification variable here.
myVars <- c("room_type", "price", "minimum_nights", "number_of_reviews", "reviews_per_month", "availability_365")  
catVars <- c("room_type")  
tab2 <- CreateTableOne(vars = myVars, strata = "borough" , data = listing, factorVars = catVars)
summary(tab2)   

# histogram for price (truncated at $500/night to keep the long right
# tail from dominating the plot)
listing %>% filter(price < 500) %>% ggplot(aes(price)) + geom_histogram() 

# histogram for price stratified by borough  
listing %>% filter(price < 500) %>% ggplot(aes(price)) + geom_histogram() + facet_wrap(~ borough)   

# histogram for price stratified by room type  
listing %>% filter(price < 500) %>% ggplot(aes(price)) + geom_histogram() + facet_wrap(~ room_type)    

# histogram for minimum_nights (short-stay listings, < 20 nights)

listing %>% filter(minimum_nights < 20) %>% ggplot(aes(minimum_nights)) + geom_histogram()

listing %>% filter(minimum_nights < 20) %>% ggplot(aes(minimum_nights)) + geom_histogram() + facet_wrap(~ borough)

listing %>% filter(minimum_nights < 20) %>% ggplot(aes(minimum_nights)) + geom_histogram() + facet_wrap(~ room_type)

# Same views for long-stay outliers (> 100 minimum nights).
listing %>% filter(minimum_nights > 100) %>% ggplot(aes(minimum_nights)) + geom_histogram()

listing %>% filter(minimum_nights > 100) %>% ggplot(aes(minimum_nights)) + geom_histogram() + facet_wrap(~ borough)

listing %>% filter(minimum_nights > 100) %>% ggplot(aes(minimum_nights)) + geom_histogram() + facet_wrap(~ room_type)

# histogram for number of review  
listing %>% ggplot(aes(number_of_reviews)) + geom_histogram()  

listing %>% ggplot(aes(number_of_reviews)) + geom_histogram() + facet_wrap(~ borough)  

# NOTE(review): this duplicates the un-faceted histogram above —
# possibly a leftover; a facet was likely intended.
listing %>% ggplot(aes(number_of_reviews)) + geom_histogram()  

listing %>% ggplot(aes(number_of_reviews)) + geom_histogram() + facet_wrap(~ room_type)  

# histogram for reviews_per_month  
listing %>% ggplot(aes(reviews_per_month)) + geom_histogram()    

listing %>% ggplot(aes(reviews_per_month)) + geom_histogram() + facet_wrap(~ borough)  

listing %>% ggplot(aes(reviews_per_month)) + geom_histogram() + facet_wrap(~ room_type)     

# histogram for availability_365 (comment fixed: was mislabeled reviews_per_month)
listing %>% ggplot(aes(availability_365)) + geom_histogram()    

listing %>% ggplot(aes(availability_365)) + geom_histogram() + facet_wrap(~ borough)  

listing %>% ggplot(aes(availability_365)) + geom_histogram() + facet_wrap(~ room_type)  

# boxplots for continuous variables, by borough (same truncations as the
# histograms above: price < 500; minimum_nights split at 20 and 100)
listing %>% filter(price < 500) %>% ggplot(aes(y = price, x = borough)) + geom_boxplot()

listing %>% filter(minimum_nights < 20) %>% ggplot(aes(y = minimum_nights, x = borough)) + geom_boxplot()  

listing %>% filter(minimum_nights > 100) %>% ggplot(aes(y = minimum_nights, x = borough)) + geom_boxplot()  

listing %>% ggplot(aes(y = number_of_reviews, x = borough)) + geom_boxplot()  

listing %>% ggplot(aes(y = reviews_per_month, x = borough)) + geom_boxplot()  

listing %>% ggplot(aes(y = availability_365, x = borough)) + geom_boxplot()  

I mapped the point data of the original Airbnb dataset, based on several variables, including borough, neighborhood, room type, and price per night.

# Point maps built directly from listing coordinates, colored by borough.
listing %>% ggplot(aes(x = longitude, y = latitude, color= borough)) + geom_point(alpha = 0.4) 

# Colored by neighbourhood; legend suppressed (too many levels to display).
listing %>% ggplot(aes(x = longitude, y = latitude, color= neighbourhood)) + geom_point(alpha = 0.4) + theme(legend.position = "none")  

listing %>% ggplot(aes(x = longitude, y = latitude, color= room_type)) + geom_point(alpha = 0.4)

# NOTE(review): midpoint = 5 on a raw (unlogged) price scale puts almost
# every listing above the midpoint — a value intended for a log scale?
# Confirm; compare the log10 version further below.
listing %>% ggplot(aes(x = longitude, y = latitude, color=price)) + geom_point(alpha = 0.6) + scale_colour_gradient2(low = "grey", mid = "blue", high = "red", midpoint = 5) 

# Same map restricted to price < 500 with a more sensible midpoint.
listing %>% filter(price < 500) %>%  ggplot(aes(x = longitude, y = latitude, color=price)) + geom_point(alpha = 0.6) + scale_colour_gradient2(low = "grey", mid = "blue", high = "red", midpoint = 100)    

listing %>% ggplot(aes(x = longitude, y = latitude, color= availability_365)) + geom_point(alpha = 0.6) + scale_colour_gradient2(low = "grey", mid = "blue", high = "red", midpoint = 100) 

# Price on a log10 color scale over New York county outlines.
listing %>%
  ggplot(aes(longitude, latitude, color = price)) +
  borders("county", regions = "New York") +
  geom_point() +
  scale_color_gradient2(low = "blue",
                        high = "red",
                        midpoint = log10(median(listing$price)),
                        trans = "log10",
                        labels = comma_format()) +
  coord_map() +
  theme_void()

# Same log10 price scale without the county borders.
listing %>% ggplot(aes(x = longitude, y = latitude, color=price)) + geom_point() + scale_colour_gradient2(low = "blue",
                        high = "red",
                        midpoint = log10(median(listing$price)),
                        trans = "log10",
                        labels = comma_format())

# Minimum nights on the same log10 scale.
listing %>% ggplot(aes(x = longitude, y = latitude, color=minimum_nights)) + geom_point() + scale_colour_gradient2(low = "blue",
                        high = "red",
                        midpoint = log10(median(listing$minimum_nights)),
                        trans = "log10",
                        labels = comma_format())

Correlation heatmap

# Spearman correlation will be used since I'm not interested particularly in linear relationships.
# vapply is type-stable (always logical), unlike sapply.
num_cols <- vapply(listing, is.numeric, logical(1))
listing_cor <- listing[, num_cols]
correlation_matrix <- cor(listing_cor, method = "spearman")
corrplot(correlation_matrix, method = "color")  

Tidy and merge data: Since we only have coordinates in the Airbnb dataset, but census tracts in the census dataset, we first converted all the coordinates into census tracts using the function call_geolocator_latlon() in the R package tigris. Since the Airbnb dataset has more than 40000 rows, we used parallel computation to speed up the process via the packages foreach and doSNOW. Then we calculated the average values of the outcome variables for each census tract. We also calculated these mean values for each room type (entire home/apartment, private room, hotel room, and shared room). Only the reviews-per-month column has missing data, and we assigned the value 0 to those missing entries. The census dataset has several rows with missing data; we removed rows in which more than 20% of the columns were missing. After data cleaning, we merged the Airbnb dataset, the census dataset, and the census shape file together by the variable CensusTracts.

names(listing)  
library(data.table)
library(parallel)

# Geocoding input: one (lat, long) pair per listing, in listing row order.
coord <- data.frame(lat = listing$latitude, long = listing$longitude)
coord <- data.table(coord)  

# parallel processing
library(foreach)
library(doSNOW)
# Size the cluster from the machine instead of hard-coding 8 workers;
# leave one core free for the OS / interactive session.
n_cores <- max(1L, parallel::detectCores() - 1L)
cl <- makeCluster(n_cores, type="SOCK")
registerDoSNOW(cl)
# parallelization: geocode each coordinate pair to a 15-digit census
# block code. Each call hits the Census geocoder over the network, so
# this is the slow step the cluster is for.
census_code <- foreach(i = seq_len(nrow(coord)), .packages='tigris') %dopar% {
call_geolocator_latlon(coord$lat[i], coord$long[i])
}   

# Flatten the list of one-element character vectors into a single
# character vector. unlist() does this directly — there is no need to
# send a pure concatenation back through the cluster with foreach, as
# the original code did.
census_code_vector <- unlist(census_code)


# The 15-digit geocoder result is a census *block* code: 2-digit state +
# 3-digit county + 6-digit tract + 4-digit block. substr(1, 11) below
# keeps the 11-digit tract-level FIPS (state + county + tract), which is
# the key the census data use.
coord$census_code <- census_code_vector 

coord$census_tract <- substr(coord$census_code, 1, 11)  
# NOTE(review): View() is interactive-only (no-op or error when the
# script is rendered non-interactively) — consider removing.
View(coord)  
View(listing)  
class(listing)

# Cache the geocoding result so the slow network lookup need not rerun.
write.csv(coord, "coord_census_tract.csv")  

# Attach tract codes back onto the listings (row order is preserved).
listing$census_tract <- coord$census_tract 

write.csv(listing, "listingCensusTract.csv")  

# Release the worker processes started above.
stopCluster(cl)  

Calculate listing information for each census tract

# Reload the listings with tract codes; drop coordinates and columns not
# used downstream ("X1" is the row-index column written by write.csv).
listing <- read_csv("../../static/csv/listingCensusTract.csv") %>% 
  select(-c("X1", "latitude", "longitude", "calculated_host_listings_count", "last_review")) 

# Listings per census tract, most frequent first.
listing %>% count(census_tract, sort = TRUE) 

names(listing)  

# Helper: mean of `var` per (census_tract, room_type), spread wide to one
# column per room type, with columns renamed prefix + suffix
# (e.g. priceEHA = mean price for Entire home/apt). Replaces five
# copy-pasted group_by/summarize/spread/rename chains, and — unlike the
# original rename() calls — tolerates a room type being absent from the
# data by renaming only the columns actually present.
summarize_room_means <- function(df, var, prefix) {
  wide <- df %>%
    group_by(census_tract, room_type) %>%
    summarize(mean_val = mean({{ var }}, na.rm = TRUE)) %>%
    spread(room_type, mean_val)
  suffixes <- c(`Entire home/apt` = "EHA", `Hotel room` = "HR",
                `Private room` = "PR", `Shared room` = "SR")
  idx <- match(names(suffixes), names(wide))
  present <- !is.na(idx)
  names(wide)[idx[present]] <- paste0(prefix, suffixes[present])
  wide
}

listing_price             <- summarize_room_means(listing, price, "price")
listing_minimum_nights    <- summarize_room_means(listing, minimum_nights, "mn")
listing_number_of_reviews <- summarize_room_means(listing, number_of_reviews, "nr")
listing_reviews_per_month <- summarize_room_means(listing, reviews_per_month, "rpm")
listing_availability_365  <- summarize_room_means(listing, availability_365, "a365")

# Listing counts per tract: overall (n) and per room type.
listing_room_type <- listing %>% group_by(census_tract) %>% 
  summarize(n_house = sum(room_type == "Entire home/apt"), n_hotel = sum(room_type == "Hotel room"), n_private = sum(room_type == "Private room"), n_shared = sum(room_type == "Shared room"), n = n())

# Assemble one wide per-tract table by sequentially left-joining every
# per-variable summary onto the price table.
tract_summaries <- list(listing_price, listing_minimum_nights,
                        listing_number_of_reviews, listing_reviews_per_month,
                        listing_availability_365, listing_room_type)
listingCensusClean <- reduce(tract_summaries, left_join, by = "census_tract")


write.csv(listingCensusClean, "listingCensusClean.csv")  

Calculate NYC_census information for each census tract

# Quick look at the cleaned census data.
nyc.df

# Reload the per-tract listing summaries; align the key name with the
# census data (CensusTract) for joining.
listing <- read_csv("../../static/csv/listingCensusClean.csv")  %>% select(-"X1") %>% rename(CensusTract = census_tract)
head(listing) 


# Keep only tracts present in both datasets; drop the redundant County column.
mydata <- listing %>% inner_join(., nyc.df, by = "CensusTract") %>% select(-"County")

write.csv(mydata, "mydata.csv")   

Working with Shapefiles

mydata <- read_csv("../../static/csv/mydata.csv") %>% select(-"X1") 

# Read the 2010 census-tract polygons as an sf object.
nyc_map <- st_read("../../static/csv/2010 Census Tracts/geo_export_9e1c8422-1515-444e-b137-5d3f3831316c.shp")

# boro_ct201 is the shapefile's tract key (borough code + 6-digit tract);
# rename it so it can be matched against mydata after re-keying below.
nyc_map <- nyc_map %>% rename(CensusTract = boro_ct201)  

# Keep characters 6-11 of the 11-digit tract FIPS (digits 1-2 state,
# 3-5 county, 6-11 tract): the 6-digit within-county tract code.
mydata$CensusTract <- substr(mydata$CensusTract, 6, 11)   

# Rebuild the shapefile-style key: NYC borough code + 6-digit tract.
mydata <- mydata %>%
  mutate(
    boro_code = case_when(
      Borough == "Bronx" ~ "2",
      Borough == "Manhattan" ~ "1",
      Borough == "Brooklyn" ~ "3",
      Borough == "Queens" ~ "4",
      # NOTE(review): the catch-all maps Staten Island to "5", but would
      # also absorb NA or misspelled borough values — confirm Borough is
      # clean, or match "Staten Island" explicitly.
      TRUE   ~  "5"
    )
  ) %>% 
  unite("CensusTract", c(boro_code, CensusTract), sep = "", remove = T) 

# Join attributes onto the polygons; nyc_map is on the left so the
# result remains an sf object.
mydata_clean <- nyc_map %>% inner_join(., mydata , by = "CensusTract")  

# Derive population shares from the census counts.
mydata_clean <- mydata_clean %>% mutate(PctWomen = Women/TotalPop, PctMen = Men/TotalPop, PctCitizen = Citizen/TotalPop, PctEmployed = Employed/TotalPop)

# st_write(mydata_clean, "mydata_clean.shp")      

Leaflet maps

# mydata_clean <- st_read("mydata_clean.shp")   

# Continuous blue-white-red palette over the per-tract listing counts.
n.pal = colorNumeric(c('blue','white','red'), domain=mydata_clean$n)  

# Choropleth of Airbnb listing counts per census tract on a dark
# basemap, with the initial view centered near Central Park.
mydata_clean %>% 
  leaflet() %>% 
  setView(lat = 40.7829, lng = -73.9654, zoom = 10) %>% 
  addProviderTiles("CartoDB.DarkMatter", options = providerTileOptions(minZoom = 9)) %>% 
  addPolygons(fillColor=~n.pal(n), weight = 0.1,
  opacity = 1,
  color = "#ffd700",
  fillOpacity = 0.9,
  # Highlight the hovered tract with a thicker gold border.
  highlight = highlightOptions(
    weight = 2,
    color = "#ffd700",
    fillOpacity = 0.7,
    bringToFront = TRUE),
  labelOptions = labelOptions(
    style = list("font-weight" = "normal", padding = "3px 8px"),
    textsize = "15px",
    direction = "auto"), 
  # Hover label: neighborhood tabulation area name and listing count.
  label=~paste0(ntaname, ': ', n)
  ) %>% 
  addLegend('topleft', pal=n.pal, values=mydata_clean$n,
            title='Airbnb listings in NYC', opacity=1)