Kaggle Renthop Challenge

Predicting the popularity of rental listings: exploratory data analysis, feature engineering using geo-spatial data, followed by building a classification model with XGBoost.

Problem Description:

In this competition, you will predict how popular an apartment rental listing is based on the listing content like text description, photos, number of bedrooms, price, etc. The data comes from renthop.com, an apartment listing website. These apartments are located in New York City. The target variable, interest_level, is defined by the number of inquiries a listing has in the duration that the listing was live on the site.

Data fields

bathrooms: number of bathrooms
bedrooms: number of bedrooms
building_id
created
description
display_address
features: a list of features about this apartment
latitude
listing_id
longitude
manager_id
photos: a list of photo links. You are welcome to download the pictures yourselves from renthop’s site, but they are the same as imgs.zip.
price: in USD
street_address
interest_level: this is the target variable. It has 3 categories: ‘high’, ‘medium’, ‘low’

Our Approach

Exploratory Data Analysis

Importing Python libraries and loading the data

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
sns.set(font_scale=1)

import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools


from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode(connected=True)

train_df = pd.read_json("../input/two-sigma-connect-rental-listing-inquiries/train.json")
test_df = pd.read_json("../input/two-sigma-connect-rental-listing-inquiries/test.json")
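
A quick sanity check of what was loaded (illustrative):

print(train_df.shape, test_df.shape)   # rows x columns of train and test
train_df.head()                        # eyeball the raw fields described above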

The target variable

sns.countplot(train_df.interest_level, order=['low', 'medium', 'high']);
plt.xlabel('Interest Level');
plt.ylabel('Number of occurrences');

[Image: count of listings per interest level]

Bathrooms and Bedrooms

fig = plt.figure(figsize=(12,12))
### Number of occurrences
sns.countplot(train_df.bathrooms, ax = plt.subplot(221));
plt.xlabel('Number of Bathrooms');
plt.ylabel('Number of occurrences');
### Average number of Bathrooms per Interest Level
sns.barplot(x='interest_level', y='bathrooms', data=train_df, order=['low', 'medium', 'high'],
            ax = plt.subplot(222));
plt.xlabel('Interest Level');
plt.ylabel('Average Number of Bathrooms');
### Average interest for every number of bathrooms
### ('interest' is a numeric encoding of interest_level, one reasonable choice, so it can be averaged)
train_df['interest'] = train_df['interest_level'].map({'low': 0, 'medium': 1, 'high': 2})
sns.pointplot(x="bathrooms", y="interest", data=train_df, ax = plt.subplot(212));
plt.xlabel('Number of Bathrooms');
plt.ylabel('Average Interest');

[Image: bathroom counts, average bathrooms per interest level, and average interest per bathroom count]

### Bedrooms graphs
fig = plt.figure(figsize=(12,12))
### Number of occurrences
sns.countplot(train_df.bedrooms, ax = plt.subplot(221));
plt.xlabel('Number of Bedrooms');
plt.ylabel('Number of occurrences');
### Average number of Bedrooms per Interest Level
sns.barplot(x='interest_level', y='bedrooms', data=train_df, order=['low', 'medium', 'high'],
            ax = plt.subplot(222));
plt.xlabel('Interest Level');
plt.ylabel('Average Number of Bedrooms');
### Average interest for every number of bedrooms
sns.pointplot(x="bedrooms", y="interest", data=train_df, ax = plt.subplot(212));
plt.xlabel('Number of Bedrooms');
plt.ylabel('Average Interest');

[Image: bedroom counts, average bedrooms per interest level, and average interest per bedroom count]

Interest levels on different days of the week

### Interest per Day of Week
### 'day_of_week' is derived from the 'created' timestamp (added here so the plot runs;
### use .dt.weekday_name on very old pandas)
train_df['day_of_week'] = pd.to_datetime(train_df['created']).dt.day_name()
fig = plt.figure(figsize=(12,6))
ax = sns.countplot(x="day_of_week", hue="interest_level",
                   hue_order=['low', 'medium', 'high'], data=train_df,
                   order=['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']);
plt.xlabel('Day of Week');
plt.ylabel('Number of occurrences');

### Adding percents over bars: each bar's label is its share of that day's total
height = [p.get_height() for p in ax.patches]
ncol = int(len(height)/3)   # bars per interest level (one per weekday)
total = [height[i] + height[i + ncol] + height[i + 2*ncol] for i in range(ncol)] * 3  # per-day totals, tiled per hue
for i, p in enumerate(ax.patches):    
    ax.text(p.get_x()+p.get_width()/2,
            height[i] + 50,
            '{:1.0%}'.format(height[i]/total[i]),
            ha="center") 

[Image: listings by day of week split by interest level, with percentage labels]

Exploring the Price

fig = plt.figure(figsize=(12,12))
### Price distribution, truncated at the 99th percentile to drop extreme outliers
sns.distplot(train_df.price[train_df.price<=train_df.price.quantile(0.99)], ax=plt.subplot(211));
plt.xlabel('Price');
plt.ylabel('Density');
### Average Price per Interest Level
sns.barplot(x="interest_level", y="price", order=['low', 'medium', 'high'],
            data=train_df, ax=plt.subplot(223));
plt.xlabel('Interest Level');
plt.ylabel('Price');
### Violinplot of price for every Interest Level
sns.violinplot(x="interest_level", y="price", order=['low', 'medium', 'high'],
               data=train_df[train_df.price<=train_df.price.quantile(0.99)],
               ax=plt.subplot(224));
plt.xlabel('Interest Level');
plt.ylabel('Price');

[Image: price distribution, average price per interest level, and price violin plots]

Word Clouds

Features

from wordcloud import WordCloud

text = ''
text_da = ''
text_desc = ''
for ind, row in train_df.iterrows():
    for feature in row['features']:
        text = " ".join([text, "_".join(feature.strip().split(" "))])
    text_da = " ".join([text_da,"_".join(row['display_address'].strip().split(" "))])
    #text_desc = " ".join([text_desc, row['description']])
text = text.strip()
text_da = text_da.strip()
text_desc = text_desc.strip()

plt.figure(figsize=(12,6))
wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(text)
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("Wordcloud for features", fontsize=30)
plt.axis("off")
plt.show()

[Image: word cloud of listing features]

# wordcloud for display address
plt.figure(figsize=(12,6))
wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(text_da)
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("Wordcloud for Display Address", fontsize=30)
plt.axis("off")
plt.show()

[Image: word cloud of display addresses]

Exploring the geographic locations of all the listings

(NOTE: We have used R for mapping the locations of all the listings)

Loading the necessary libraries

library(tigris)
library(dplyr)
library(leaflet)
library(sp)
library(ggmap)
library(maptools)
library(broom)
library(httr)
library(rgdal)

Importing New York City neighborhood data

r <- GET('http://data.beta.nyc//dataset/0ff93d2d-90ba-457c-9f7e-39e47bf2ac5f/resource/35dd04fb-81b3-479b-a074-a27a37888ce7/download/d085e2f8d0b54d4590b1e7d1f35594c1pediacitiesnycneighborhoods.geojson')
nyc_neighborhoods <- readOGR(content(r,'text'), 'OGRGeoJSON', verbose = F)
nyc_neighborhoods_df <- tidy(nyc_neighborhoods)

Plotting the neighborhood data

nyc_map <- get_map(location = c(lon = -74.00, lat = 40.71), maptype = "terrain", zoom = 11)
suppressMessages(ggmap(nyc_map)) + 
  geom_polygon(data=nyc_neighborhoods_df, aes(x=long, y=lat, group=group), color="blue", fill=NA)

[Image: NYC neighborhood boundaries overlaid on a terrain map]

Finding the neighborhoods of all the listings

lats <- train$latitude
lngs <- train$longitude
points <- data.frame(lat=as.numeric(lats), lng=as.numeric(lngs))
points_spdf <- points
coordinates(points_spdf) <- ~lng + lat
proj4string(points_spdf) <- proj4string(nyc_neighborhoods)
matches <- over(points_spdf, nyc_neighborhoods)
points <- cbind(points, matches)
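
For readers who prefer to stay in Python end to end, a roughly equivalent spatial join can be sketched with geopandas (assumptions: the same GeoJSON is saved locally as nyc_neighborhoods.geojson, and a recent geopandas is installed; older versions take op= instead of predicate=):

import geopandas as gpd
from shapely.geometry import Point

neighborhoods = gpd.read_file('nyc_neighborhoods.geojson')
listings = gpd.GeoDataFrame(
    train_df,
    geometry=[Point(xy) for xy in zip(train_df.longitude, train_df.latitude)],
    crs=neighborhoods.crs,
)
# left join: each listing picks up the attributes of the polygon containing it
listings = gpd.sjoin(listings, neighborhoods, how='left', predicate='within')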

Plotting the distribution of the listings

# after merging the matched neighborhoods back into train
points <- train[c('lat','lng','neighborhood','boroughCode','borough','X.id')]
points_by_neighborhood <- points %>%
  group_by(neighborhood) %>%
  summarize(num_points=n())

map_data <- geo_join(nyc_neighborhoods, points_by_neighborhood, "neighborhood", "neighborhood")
pal <- colorNumeric(palette = "RdBu", domain = range(map_data@data$num_points, na.rm=T))

plot_data <- tidy(nyc_neighborhoods, region="neighborhood") %>%
  left_join(., points_by_neighborhood, by=c("id"="neighborhood")) %>%
  filter(!is.na(num_points))
nyc_map <- get_map(location = c(lon = -74.00, lat = 40.71), maptype = "terrain", zoom = 10)

ggmap(nyc_map) + 
  geom_polygon(data=plot_data, aes(x=long, y=lat, group=group, fill=num_points),colour='black', alpha=0.75)

[Image: choropleth of listing counts by neighborhood]

Exploring transportation options for each listing

Plotting the NYC subway station data

library(geosphere)
subway <- GET('https://data.cityofnewyork.us/api/views/kk4q-3rt2/rows.csv?accessType=DOWNLOAD')
subway_data <- read.csv(text = content(subway, 'text'))
train$latitude <- as.numeric(train$latitude)
train$longitude <- as.numeric(train$longitude)
test$latitude <- as.numeric(test$latitude)
test$longitude <- as.numeric(test$longitude)
nyc_map <- get_map(location = c(lon = -74.00, lat = 40.71), maptype = "terrain", zoom = 11)
ggmap(nyc_map) +
  geom_point(data = subway_data, aes(x = longitude, y = latitude, fill = "red", alpha = 1), size = 2,shape = 21) +
  guides(fill=FALSE, alpha=FALSE, size=FALSE)

[Image: NYC subway station locations plotted on the city map]
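
The geosphere package loaded above computes geographic distances, which suggests the stations feed a proximity feature. A hedged Python sketch of a distance-to-nearest-subway feature (assumes the station coordinates sit in a pandas DataFrame subway_data with latitude and longitude columns, as in the plot above):

import numpy as np

def haversine_km(lat1, lon1, lat2, lon2):
    # great-circle distance in km; inputs in degrees, arrays broadcast
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371.0 * np.arcsin(np.sqrt(a))

station_lat = subway_data['latitude'].values
station_lon = subway_data['longitude'].values
# for each listing, the distance to its closest station
train_df['dist_to_subway'] = [
    haversine_km(lat, lon, station_lat, station_lon).min()
    for lat, lon in zip(train_df.latitude, train_df.longitude)
]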

Creating a database of the median rent in each neighborhood

grp_cols <- c('neighborhood','bedrooms')

# Convert character vector to list of symbols
dots <- lapply(grp_cols, as.symbol)

# Perform frequency counts
area_database <- all_data %>%
    group_by_(.dots=dots) %>%
    summarise(price = median(price),n=n())
write.csv(area_database,file = 'area_database.csv')    
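
For reference, the same aggregation in pandas (a sketch; assumes all_data is a DataFrame combining train and test with neighborhood, bedrooms, and price columns, and pandas 0.25+ for named aggregation):

area_database = (all_data.groupby(['neighborhood', 'bedrooms'])
                 .agg(price=('price', 'median'), n=('price', 'size'))
                 .reset_index())
area_database.to_csv('area_database.csv', index=False)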

Feature Engineering

(NOTE: Back to python)

Importing the necessary libraries

import pandas as pd
import os
import sys
import operator
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.preprocessing import Imputer
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import ExtraTreesClassifier

Converting the ‘created’ column into a datetime object

train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

Extracting additional features from the datetime object

train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year

train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month

train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day

Calculating the average price for similar houses in the neighborhood (similar number of bedrooms)

area_database = pd.read_csv('area_database.csv')
def get_neigborhood_avg(row):
    # median price of listings with the same neighborhood and bedroom count;
    # returns NaN when the combination is missing from area_database
    match = area_database.loc[(area_database.neighborhood == row['neighborhood']) &
                              (area_database.bedrooms == row['bedrooms']), 'price']
    return match.iloc[0] if len(match) else np.nan
train_df['neighborhood_avg'] = train_df.apply(get_neigborhood_avg, axis=1)
test_df['neighborhood_avg'] = test_df.apply(get_neigborhood_avg, axis=1)
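
The row-wise apply above scans area_database once per listing; a merge does the same lookup in a single pass (an equivalent sketch under the same column assumptions):

avg = area_database.rename(columns={'price': 'neighborhood_avg'})
avg = avg[['neighborhood', 'bedrooms', 'neighborhood_avg']]
train_df = train_df.merge(avg, on=['neighborhood', 'bedrooms'], how='left')
test_df = test_df.merge(avg, on=['neighborhood', 'bedrooms'], how='left')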

Calculating the price difference between the listing and the market rate

train_df['price_difference'] = train_df['price'] - train_df['neighborhood_avg']
test_df['price_difference'] = test_df['price'] - test_df['neighborhood_avg']
train_df['relative_price'] = train_df['price_difference']/train_df['neighborhood_avg']
test_df['relative_price'] = test_df['price_difference']/test_df['neighborhood_avg']
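
To make the scale concrete: a listing priced 20% above comparable units in its neighborhood gets relative_price = 0.2, and one at the market rate gets 0.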

A few additional features

# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

Label encoding the categorical features

categorical = ["display_address", "manager_id", "building_id", "street_address",'neighborhood']
for f in categorical:
    if train_df[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        # fit on train + test together so both share a single label space
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))

Dealing with the ‘features’ column

The features column holds a list of strings describing each listing, so we join them into one string (replacing the spaces inside each feature with underscores) and apply a count vectorizer on top of it.

train_df['features'].fillna("",inplace=True)
test_df['features'].fillna("",inplace=True)
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
tfidf = CountVectorizer(stop_words='english', max_features=200)
tfidf.fit(list(train_df['features']) + list(test_df['features']))
tr_sparse = tfidf.transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])
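
Despite the variable name tfidf, this is a CountVectorizer, so the sparse matrices hold raw token counts rather than tf-idf weights. A quick check of what the transformation produces (illustrative output):

# e.g. ['Hardwood Floors', 'Cats Allowed'] becomes 'Hardwood_Floors Cats_Allowed'
print(train_df['features'].iloc[0])
print(tr_sparse.shape)   # (number of train rows, vocabulary size capped at 200)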

Dealing with the missing values

# features_to_use collects the dense numeric columns built above; this is one
# plausible list, reconstructed from the features engineered in this post
features_to_use = ["bathrooms", "bedrooms", "latitude", "longitude", "price",
                   "neighborhood_avg", "price_difference", "relative_price",
                   "num_photos", "num_features", "num_description_words",
                   "created_year", "created_month", "created_day",
                   "display_address", "manager_id", "building_id",
                   "street_address", "neighborhood"]

fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0)  # column-wise means
train_imputed = pd.DataFrame(fill_NaN.fit_transform(train_df[features_to_use]))
train_imputed.columns = train_df[features_to_use].columns
train_imputed.index = train_df.index

# reuse the imputer fitted on train so test is filled with the same values
test_imputed = pd.DataFrame(fill_NaN.transform(test_df[features_to_use]))
test_imputed.columns = test_df[features_to_use].columns
test_imputed.index = test_df.index
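
Note that Imputer was removed in scikit-learn 0.22; on newer versions the drop-in equivalent is SimpleImputer:

from sklearn.impute import SimpleImputer
fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')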

Building the final dataset by stacking the dense numeric features and the sparse feature matrix together

train_X = sparse.hstack([train_imputed, tr_sparse]).tocsr()
test_X = sparse.hstack([test_imputed, te_sparse]).tocsr()

Converting the target variable

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print(train_X.shape, test_X.shape)

Machine Learning

Building an XGBoost model

def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model
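
One caveat: when early stopping fires, model.predict uses all trained trees by default. To score with the best iteration instead, pass a tree limit (ntree_limit is the older xgboost API; releases from 1.4 onward use iteration_range):

# inside runXGB, when test_y is not None:
# pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)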

Cross-validation and training the model

cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
    dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    preds, model = runXGB(dev_X, dev_y, val_X, val_y)
    cv_scores.append(log_loss(val_y, preds))
    print(cv_scores)
    break
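
The break above keeps only the first fold for speed; removing it and averaging the scores gives a proper 5-fold estimate:

# after removing the break:
print('mean CV mlogloss:', np.mean(cv_scores))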

With early stopping, training halted after 854 iterations, at train-mlogloss 0.37921 and test-mlogloss 0.522394.

Predicting on the test set

preds, model = runXGB(train_X, train_y, test_X, num_rounds=400)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("predictions.csv", index=False)
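
Note that the output column order high, medium, low lines up with target_num_map from earlier (high maps to 0, medium to 1, low to 2), so each predicted probability lands under the correct label.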
Note: The XGBoost code was inspired by a notebook on Kaggle by SRK.

Project Link: Click here to view the full notebook on GitHub