# web scraping
library(netstat)
library(RSelenium)
library(rvest)
# core libraries
library(tidyverse)
library(tidyquant)
library(janitor)
library(stringr)
library(lubridate)
library(forcats)
# modeling
library(modeltime)
library(timetk)
library(tidymodels)
library(h2o)
library(recipes)
library(finetune)
library(tidytext)
library(lime)
# visualize
library(vip)
library(plotly)
library(patchwork)
library(ggrepel)
# eda
library(DataExplorer)
library(skimr)
date <- today()

options(scipen = 9999)
Broward County Rental Analysis & Price-Range Prediction
Summary
Are nightly rentals on Airbnb listed at the correct price? What features in the data can improve pricing and offer customers a tool to improve their rental rate? How much does the text in a listing's description help predict pricing?
Instead of randomly selecting rentals for price improvement, the model used in this analysis shows a 3x improvement in some price ranges.
The analysis covers Broward County, Florida, and the classification predictions answer all of these questions.
Libraries
Data
<- read_rds("01_data_prep/calendar_filtered_dt.rds")
calendar_filtered_dt <- read_rds("01_data_prep/listings_filtered_dt.rds")
listings_filtered_dt
%>% head(10) listings_filtered_dt
# A tibble: 10 × 75
id listing_url scrap…¹ last_scr…² name descr…³ neigh…⁴ pictu…⁵ host_id
<dbl> <chr> <dbl> <date> <chr> <chr> <chr> <chr> <dbl>
1 4241389 https://www… 2.02e13 2022-03-25 Marr… "You a… <NA> https:… 2.20e7
2 4562966 https://www… 2.02e13 2022-03-25 Loca… "Stay … <NA> https:… 5.40e6
3 4595496 https://www… 2.02e13 2022-03-25 The … "This … <NA> https:… 2.38e7
4 4596401 https://www… 2.02e13 2022-03-25 Beau… "My co… I love… https:… 2.38e7
5 4767766 https://www… 2.02e13 2022-03-25 ★3br… "*Laud… The ho… https:… 1.45e7
6 4876339 https://www… 2.02e13 2022-03-24 20mi… "A sim… Hollyw… https:… 2.45e7
7 5285577 https://www… 2.02e13 2022-03-25 Key … "This … That t… https:… 2.30e7
8 5445232 https://www… 2.02e13 2022-03-25 A Cl… "Our 2… Our ne… https:… 1.16e7
9 5661115 https://www… 2.02e13 2022-03-24 SOLA… "Apart… urban … https:… 2.93e7
10 5724290 https://www… 2.02e13 2022-03-24 Self… "Entir… Our ne… https:… 2.56e7
# … with 66 more variables: host_url <chr>, host_name <chr>, host_since <date>,
# host_location <chr>, host_about <chr>, host_response_time <chr>,
# host_response_rate <chr>, host_acceptance_rate <chr>,
# host_is_superhost <lgl>, host_thumbnail_url <chr>, host_picture_url <chr>,
# host_neighbourhood <chr>, host_listings_count <dbl>,
# host_total_listings_count <dbl>, host_verifications <chr>,
# host_has_profile_pic <lgl>, host_identity_verified <lgl>, …
Data Preparation
The first step is to get all of the data into the correct format. More features will be added later in the recipe.
listings_prepared_tbl <- listings_filtered_dt %>%
    as_tibble() %>%
    select(price, id, listing_url,
           name, description, amenities, neighbourhood_cleansed,
           latitude, longitude, accommodates,
           bathrooms_text, bedrooms, beds,
           number_of_reviews, review_scores_rating) %>%
    # create price_rank and use ntile for equal-proportion binning
    mutate(price = price %>% str_remove_all("\\$|,") %>% as.double(),
           price_rank = ntile(price, 10)) %>%
    relocate(price_rank) %>%
    # find min/max of each price_rank group
    group_by(price_rank) %>%
    mutate(
        min_price_by_rank = min(price, na.rm = T),
        max_price_by_rank = max(price, na.rm = T)
    ) %>%
    ungroup() %>%
    # new character column for priceRange... automated.
    unite('priceRange', min_price_by_rank:max_price_by_rank, sep = "-", remove = T) %>%
    relocate(priceRange) %>%
    # bathrooms & bedrooms clean up
    mutate(bathrooms = gsub('[ baths]', '', bathrooms_text) %>% as.double() %>% round(., 0)) %>%
    drop_na(bathrooms) %>%
    select(-bathrooms_text) %>%
    mutate(bedrooms = ifelse(is.na(bedrooms), 0, bedrooms))

listings_prepared_tbl %>% glimpse()
Rows: 5,359
Columns: 17
$ priceRange <chr> "130-153", "263-332", "183-215", "81-106", "153…
$ price_rank <int> 4, 8, 6, 2, 5, 4, 2, 5, 10, 2, 1, 9, 9, 3, 3, 9…
$ price <dbl> 132, 300, 189, 89, 166, 133, 92, 163, 590, 98, …
$ id <dbl> 4241389, 4562966, 4595496, 4596401, 4767766, 48…
$ listing_url <chr> "https://www.airbnb.com/rooms/4241389", "https:…
$ name <chr> "Marriott's BeachPlace Towers - Fort Lauderdale…
$ description <chr> "You are booking directly with a Marriott owner…
$ amenities <chr> "[\"Pack \\u2019n play/Travel crib\", \"Paid pa…
$ neighbourhood_cleansed <chr> "Fort Lauderdale", "Fort Lauderdale", "Hollywoo…
$ latitude <dbl> 26.12221, 26.12937, 25.98982, 26.01759, 26.1847…
$ longitude <dbl> -80.10521, -80.10438, -80.11786, -80.13178, -80…
$ accommodates <dbl> 4, 4, 3, 3, 12, 4, 3, 6, 2, 2, 2, 4, 4, 4, 2, 6…
$ bedrooms <dbl> 1, 0, 1, 1, 3, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2,…
$ beds <dbl> 2, 2, 1, 1, 4, 2, 2, 3, 2, 2, 1, 2, 2, 4, 1, 4,…
$ number_of_reviews <dbl> 6, 44, 5, 2, 201, 104, 387, 440, 198, 293, 316,…
$ review_scores_rating <dbl> 4.83, 4.84, 5.00, 2.00, 4.69, 4.45, 4.61, 4.83,…
$ bathrooms <dbl> 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2,…
Visualize
Broward County is on the east coast of Florida, where Fort Lauderdale is located. It is just north of Miami.
What do we see?
- Most locations close to the beach are slightly lighter in color (light blue, light green), meaning more expensive.
- Some very expensive (yellow) locations dot the coast, but also appear inland along the inlet.
- The average listing has 1.7 bedrooms and accommodates 5.1 people.
- Most locations receive 4- or 5-star reviews, and very few are given 1 star.
Text Analysis
I want to see how much the text matters to the classification model. If it's important, it will show up in the variable importance plot.
First, let's tokenize the description. We need to remove stop words like "a," "the," "with," etc.
airbnb_text <- listings_prepared_tbl %>%
    mutate(priceRange = parse_number(priceRange)) %>%
    unnest_tokens(word, description) %>%
    anti_join(get_stopwords())

airbnb_text %>% count(word, sort = T)
# A tibble: 10,810 × 2
word n
<chr> <int>
1 br 41621
2 b 10611
3 beach 6755
4 space 4888
5 2 3672
6 bedroom 3415
7 located 3091
8 kitchen 3052
9 home 3024
10 room 2906
# … with 10,800 more rows
Word Frequency
Let’s find the frequency of the top 100 words in each priceRange.
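The chunk that builds word_frequency isn't shown, so here's a minimal sketch of one way to construct it, assuming proportion normalizes each word's count by the total number of words in its price range (column names follow the printout below; the author's exact normalization may differ):

# top 100 words overall (stop words already removed)
top_words <- airbnb_text %>%
    count(word, sort = TRUE) %>%
    slice_max(n, n = 100) %>%
    pull(word)

# per-priceRange counts and proportions for those words
word_frequency <- airbnb_text %>%
    count(priceRange, word) %>%
    group_by(priceRange) %>%
    mutate(price_total = sum(n),
           proportion  = n / price_total) %>%
    ungroup() %>%
    filter(word %in% top_words)

word_frequency %>% arrange(word, priceRange)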
# A tibble: 20 × 5
word priceRange n price_total proportion
<chr> <dbl> <int> <int> <dbl>
1 10 18 88 44136 0.000000184
2 10 81 107 42365 0.000000234
3 10 106 102 43943 0.000000215
4 10 130 100 45190 0.000000205
5 10 153 84 46766 0.000000166
6 10 183 110 46257 0.000000220
7 10 215 98 47324 0.000000192
8 10 263 97 48164 0.000000186
9 10 333 86 49175 0.000000162
10 10 500 85 48302 0.000000163
11 access 18 270 44136 0.000000566
12 access 81 320 42365 0.000000699
13 access 106 336 43943 0.000000707
14 access 130 317 45190 0.000000649
15 access 153 273 46766 0.000000540
16 access 183 321 46257 0.000000642
17 access 215 299 47324 0.000000584
18 access 263 243 48164 0.000000467
19 access 333 274 49175 0.000000515
20 access 500 229 48302 0.000000439
Text Modeling
Next, let's model which words increase in frequency as price rises, and which decrease.
Let's compare the frequency of words in higher-priced rentals vs. lower-priced ones.
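The word_mods chunk isn't shown either. One plausible construction, sketched under the assumption of a per-word binomial GLM of usage against priceRange (the nested data, term, and estimate columns match the printouts below):

# fit one binomial model per word: does usage rise or fall with price?
# cbind(n, price_total) approximates cbind(successes, failures) since
# each word's count is small relative to the total
word_mods <- word_frequency %>%
    nest(data = c(priceRange, n, price_total, proportion)) %>%
    mutate(model = map(data, ~ glm(cbind(n, price_total) ~ priceRange,
                                   data = .x, family = "binomial")),
           model = map(model, broom::tidy)) %>%
    unnest(model) %>%
    filter(term == "priceRange") %>%
    # adjust p-values for the many models fit
    mutate(p.value = p.adjust(p.value))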
Words like ocean, views, pool, resort, family, house, and king increase with price.
word_mods %>% arrange(-estimate)
# A tibble: 100 × 7
word data term estimate std.error statistic p.value
<chr> <list> <chr> <dbl> <dbl> <dbl> <dbl>
1 bedrooms <tibble [10 × 4]> priceRange 0.00374 0.000213 17.5 8.93e-67
2 views <tibble [10 × 4]> priceRange 0.00351 0.000229 15.3 7.99e-51
3 floor <tibble [10 × 4]> priceRange 0.00290 0.000236 12.3 8.72e-33
4 resort <tibble [10 × 4]> priceRange 0.00226 0.000175 12.9 4.20e-36
5 large <tibble [10 × 4]> priceRange 0.00197 0.000215 9.17 4.06e-18
6 king <tibble [10 × 4]> priceRange 0.00195 0.000209 9.35 7.34e-19
7 view <tibble [10 × 4]> priceRange 0.00177 0.000239 7.44 7.62e-12
8 pool <tibble [10 × 4]> priceRange 0.00175 0.000129 13.6 5.58e-40
9 family <tibble [10 × 4]> priceRange 0.00174 0.000190 9.15 4.55e-18
10 ocean <tibble [10 × 4]> priceRange 0.00172 0.000196 8.76 1.64e-16
# … with 90 more rows
Words like studio, restaurants, wifi, coffee, parking, shopping, mall, minutes, and mentions of the airport decrease with price.
word_mods %>% arrange(estimate)
# A tibble: 100 × 7
word data term estimate std.error statistic p.value
<chr> <list> <chr> <dbl> <dbl> <dbl> <dbl>
1 studio <tibble [10 × 4]> priceRange -0.00768 0.000337 -22.8 5.57e-113
2 mall <tibble [10 × 4]> priceRange -0.00308 0.000324 -9.52 1.51e- 19
3 airport <tibble [10 × 4]> priceRange -0.00272 0.000257 -10.6 4.32e- 24
4 miles <tibble [10 × 4]> priceRange -0.00269 0.000294 -9.16 4.51e- 18
5 coffee <tibble [10 × 4]> priceRange -0.00261 0.000271 -9.64 4.71e- 20
6 free <tibble [10 × 4]> priceRange -0.00232 0.000213 -10.9 1.14e- 25
7 close <tibble [10 × 4]> priceRange -0.00219 0.000280 -7.83 3.90e- 13
8 parking <tibble [10 × 4]> priceRange -0.00201 0.000172 -11.7 7.30e- 30
9 minutes <tibble [10 × 4]> priceRange -0.00201 0.000189 -10.6 2.57e- 24
10 las <tibble [10 × 4]> priceRange -0.00178 0.000257 -6.94 2.82e- 10
# … with 90 more rows
Visualize with Volcano plot
The chart below compares word frequency in higher-priced rentals (right of the vertical line) to lower-priced rentals.
word_mods %>%
    mutate(p.value = log10(p.value)) %>%
    ggplot(aes(estimate, p.value)) +
    geom_vline(xintercept = 0, lty = 2, alpha = 0.7, color = "gray50") +
    geom_point(color = "midnightblue", alpha = 0.8, size = 2.5) +
    geom_text_repel(aes(label = word), max.overlaps = 40)
It's no surprise that lower-priced rentals feel the need to describe their proximity to amenities like restaurants, shopping, and the mall, as well as included, or free, items like coffee and wifi.
Filter upper/lower p.value
higher_words <- word_mods %>%
    filter(p.value < 0.05) %>%
    slice_max(estimate, n = 12) %>%
    pull(word)
higher_words
[1] "bedrooms" "views" "floor" "resort" "large" "king"
[7] "view" "pool" "family" "ocean" "condo" "dining"
lower_words <- word_mods %>%
    filter(p.value < 0.05) %>%
    slice_max(-estimate, n = 12) %>%
    pull(word)
lower_words
[1] "studio" "mall" "airport" "miles" "coffee" "free"
[7] "close" "parking" "minutes" "las" "olas" "shopping"
Visualize
The charts below show the top 12 words associated with price increases & decreases.
- As price increases, the frequency of the higher-priced words trends upward.
- On lower-priced rentals, the curve starts high for words like "airport" and "shopping," and then decreases as the price increases.
# higher words
word_frequency %>%
    filter(word %in% higher_words) %>%
    ggplot(aes(priceRange, proportion, color = word)) +
    geom_line(size = 1.5, alpha = 0.7, show.legend = F) +
    facet_wrap(vars(word), scales = "free_y") +
    scale_x_continuous(labels = scales::dollar) +
    scale_y_continuous(labels = scales::percent, limits = c(0, NA)) +
    labs(x = "rental price", y = "proportion of total words used for rentals in that price")
# lower words
word_frequency %>%
    filter(word %in% lower_words) %>%
    ggplot(aes(priceRange, proportion, color = word)) +
    geom_line(size = 1.5, alpha = 0.7, show.legend = F) +
    facet_wrap(vars(word), scales = "free_y") +
    scale_x_continuous(labels = scales::dollar) +
    scale_y_continuous(labels = scales::percent, limits = c(0, NA)) +
    labs(x = "rental price", y = "proportion of total words used for rentals in that price")
Maybe this text analysis helps the model. Let’s find out.
Modeling
Train/Test Split
Because there isn't much data, the initial split is 90% training, and the rest is testing. Adding more data might introduce more noise, but with a larger dataset I could use an 80/20 split, which is what I normally prefer. I can always change it later.
set.seed(123)
split <- listings_prepared_tbl %>%
    select(-listing_url, -name, -price, -price_rank) %>%
    mutate(description = str_to_lower(description)) %>%
    initial_split(strata = priceRange, prop = 90/100)

train_tbl <- training(split)
test_tbl  <- testing(split)

metrics <- metric_set(accuracy, roc_auc, mn_log_loss)

# resamples
set.seed(234)
v_folds <- vfold_cv(train_tbl, v = 5, strata = priceRange)
v_folds
# 5-fold cross-validation using stratification
# A tibble: 5 × 2
splits id
<list> <chr>
1 <split [3852/966]> Fold1
2 <split [3854/964]> Fold2
3 <split [3854/964]> Fold3
4 <split [3856/962]> Fold4
5 <split [3856/962]> Fold5
Recipe + Regex Pattern
ML models can be finicky, and recipes are key to preparing the data. I create quick regex patterns from the 24 high- and low-price words found above.
higher_pat <- glue::glue_collapse(higher_words, sep = "|")
lower_pat  <- glue::glue_collapse(lower_words, sep = "|")
recipe_spec <- recipe(priceRange ~ ., data = train_tbl) %>%
    update_role(id, new_role = "id") %>%
    # create new indicator variables based on the regex patterns
    step_regex(description, pattern = higher_pat, result = "high_price_words") %>%
    step_regex(description, pattern = lower_pat, result = "low_price_words") %>%
    step_rm(description) %>%
    # drop beds if it has zero variance
    step_zv(beds) %>%
    step_novel(neighbourhood_cleansed) %>%
    step_dummy(all_nominal_predictors(), one_hot = T) %>%
    step_nzv(all_predictors())
recipe_spec
Recipe
Inputs:
role #variables
id 1
outcome 1
predictor 11
Operations:
Regular expression dummy variable using "bedrooms|views|floor|resort|large|king|view|po...
Regular expression dummy variable using "studio|mall|airport|miles|coffee|free|close|pa...
Variables removed description
Zero variance filter on beds
Novel factor level assignment for neighbourhood_cleansed
Dummy variables from all_nominal_predictors()
Sparse, unbalanced variable filter on all_predictors()
XGBoost Model
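The spec chunk itself isn't shown; here's a sketch that would produce the printout below. The names xgb_spec and xgb_word_wflw are assumed, the latter taken from its later use in finalize_workflow().

xgb_spec <- boost_tree(
    mtry        = tune(),
    trees       = 1000,
    min_n       = tune(),
    tree_depth  = tune(),
    learn_rate  = tune(),
    sample_size = tune()
) %>%
    set_engine("xgboost") %>%
    set_mode("classification")

# bundle the recipe and model into one workflow
xgb_word_wflw <- workflow() %>%
    add_recipe(recipe_spec) %>%
    add_model(xgb_spec)

xgb_spec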
Boosted Tree Model Specification (classification)
Main Arguments:
mtry = tune()
trees = 1000
min_n = tune()
tree_depth = tune()
learn_rate = tune()
sample_size = tune()
Computational engine: xgboost
Custom Grid
I create a custom grid to have more control over the tuning search and allow more complex models.
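The grid chunk isn't shown; a sketch using a Latin hypercube from dials, with parameter ranges assumed from the 20 combinations printed below:

set.seed(345)  # assumed seed
xgb_grid <- grid_latin_hypercube(
    tree_depth(range = c(5L, 10L)),
    min_n(range = c(10L, 40L)),
    mtry(range = c(5L, 10L)),
    sample_size = sample_prop(range = c(0.5, 1)),
    learn_rate(range = c(-2, -1)),  # log10 scale: 0.01 to 0.1
    size = 20
)

xgb_grid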
# A tibble: 20 × 5
tree_depth min_n mtry sample_size learn_rate
<int> <int> <int> <dbl> <dbl>
1 7 33 8 0.768 0.0845
2 10 33 7 0.928 0.0784
3 5 21 6 0.626 0.0868
4 9 31 8 0.728 0.0162
5 8 35 5 0.666 0.0937
6 6 21 5 0.907 0.0105
7 6 27 6 0.982 0.0729
8 7 33 8 0.936 0.0102
9 7 15 5 0.559 0.0182
10 6 35 9 0.784 0.0347
11 9 39 9 0.737 0.0582
12 8 17 8 0.596 0.0818
13 9 21 7 0.601 0.0136
14 7 15 7 0.763 0.0197
15 6 12 10 0.800 0.0569
16 9 19 9 0.589 0.0138
17 10 14 5 0.829 0.0140
18 8 37 10 0.664 0.0202
19 5 11 5 0.514 0.0136
20 10 38 9 0.962 0.0150
Tune parameters with Racing methods
5 folds (resamples) x 20 candidate parameter sets = 100 possible XGBoost models.
Racing won't train all 100; it throws out poor candidates along the way.
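A sketch of the racing call with finetune::tune_race_anova(); mn_log_loss is placed first in the metric set so racing minimizes it, matching the log output below:

set.seed(456)  # assumed seed
xgb_word_rs <- tune_race_anova(
    xgb_word_wflw,
    resamples = v_folds,
    grid      = xgb_grid,
    metrics   = metric_set(mn_log_loss, accuracy, roc_auc),
    control   = control_race(verbose_elim = TRUE)
)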
ℹ Racing will minimize the mn_log_loss metric.
ℹ Resamples are analyzed in a random order.
ℹ Fold5: 14 eliminated; 6 candidates remain.
ℹ Fold4: All but one parameter combination were eliminated.
# Tuning results
# 5-fold cross-validation using stratification
# A tibble: 5 × 5
splits id .order .metrics .notes
<list> <chr> <int> <list> <list>
1 <split [3852/966]> Fold1 3 <tibble [20 × 9]> <tibble [20 × 3]>
2 <split [3854/964]> Fold3 1 <tibble [20 × 9]> <tibble [20 × 3]>
3 <split [3856/962]> Fold5 2 <tibble [20 × 9]> <tibble [20 × 3]>
4 <split [3856/962]> Fold4 4 <tibble [6 × 9]> <tibble [6 × 3]>
5 <split [3854/964]> Fold2 5 <tibble [1 × 9]> <tibble [1 × 3]>
There were issues with some computations:
- Warning(s) x20: There are new levels in a factor: ["Air conditioning", "Carbon mo...
- Warning(s) x20: There are new levels in a factor: ["Air conditioning", "Carbon mo...
- Warning(s) x1: There are new levels in a factor: ["Air conditioning", "Carbon mo...
- Warning(s) x6: There are new levels in a factor: ["Air conditioning", "Carbon mo...
- Warning(s) x20: There are new levels in a factor: ["Air conditioning", "Carbon mo...
Run `show_notes(.Last.tune.result)` for more information.
Visualize Race Results
plot_race(xgb_word_rs)
show_best(xgb_word_rs)
# A tibble: 1 × 11
mtry min_n tree_depth learn_rate sample…¹ .metric .esti…² mean n std_err
<int> <int> <int> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl>
1 5 14 10 0.0140 0.829 mn_log… multic… 1.87 5 0.00759
# … with 1 more variable: .config <chr>, and abbreviated variable names
# ¹sample_size, ².estimator
Finalize on Best
xgb_last <- xgb_word_wflw %>%
    finalize_workflow(select_best(xgb_word_rs, "mn_log_loss")) %>%
    last_fit(split)
! train/test split: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: ["Pack \u2019n play/Travel crib", "Pai...
xgb_last
# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [4818/541]> train/test split <tibble> <tibble> <tibble> <workflow>
There were issues with some computations:
- Warning(s) x1: There are new levels in a factor: ["Pack \u2019n play/Travel crib...
Run `show_notes(.Last.tune.result)` for more information.
Results
Variable Importance
Latitude and longitude are the most important features, followed by the number of people a rental accommodates. The text-analysis features were used by the model, but weren't as important as the review features.
extract_workflow(xgb_last) %>%
extract_fit_parsnip() %>%
vip(geom = "point", num_features = 15)
Logloss
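The log-loss chunk isn't shown; one way to compute it from the holdout predictions (the probability column names are derived from the priceRange bins, so I select them rather than type them out):

# holdout predictions from last_fit()
preds <- xgb_last %>% collect_predictions()

# all class-probability columns (everything but the hard class call)
prob_cols <- preds %>%
    select(starts_with(".pred_"), -.pred_class) %>%
    names()

preds %>% mn_log_loss(priceRange, all_of(prob_cols))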
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 mn_log_loss multiclass 1.85
Confusion Matrix
More price-adjustment suggestions are picked up in the very expensive and very cheap price ranges.
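The matrix came from a chunk that isn't shown; presumably something along these lines, reusing preds from above:

preds %>%
    conf_mat(priceRange, .pred_class) %>%
    autoplot(type = "heatmap")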
ROC Curve
Receiver Operating Characteristic
The model identifies the most expensive & cheapest locations more easily than the mid-priced ones. This suggests the mid-priced rentals are probably priced the best.
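A sketch of the one-vs-all ROC curves, reusing preds and prob_cols from above:

preds %>%
    roc_curve(priceRange, all_of(prob_cols)) %>%
    autoplot()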
AUC
Area Under the Curve
An AUC of 1 is a perfect model, and 0.50 adds no intelligence over random guessing. Our predictions are okay. Maybe another model would be better, or an ensemble, or another round of tuning.
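The per-bin table below reports the hand_till multiclass estimator within each predicted class, which suggests grouped predictions; a sketch, with the grouping assumed:

preds %>%
    group_by(.pred_class) %>%
    roc_auc(priceRange, all_of(prob_cols)) %>%
    arrange(desc(.estimate))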
# A tibble: 10 × 4
.pred_class .metric .estimator .estimate
<fct> <chr> <chr> <dbl>
1 333-500 roc_auc hand_till 0.751
2 263-332 roc_auc hand_till 0.713
3 183-215 roc_auc hand_till 0.698
4 81-106 roc_auc hand_till 0.667
5 215-263 roc_auc hand_till 0.657
6 153-183 roc_auc hand_till 0.656
7 500-9000 roc_auc hand_till 0.649
8 130-153 roc_auc hand_till 0.621
9 106-130 roc_auc hand_till 0.609
10 18-81 roc_auc hand_till 0.556
Gain
The gain chart shows that, within the first 30% of listings in the $500+ price group, the model captures 89% of all listings where a price adjustment would be recommended.
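A sketch of the gain chart, again reusing preds and prob_cols:

preds %>%
    gain_curve(priceRange, all_of(prob_cols)) %>%
    autoplot()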
Lift
The lift quantifies the gain. Using the example above:
89% / 30% = 2.96x improvement
Using the model, there is almost a 3x improvement in helping rentals improve their list price based on the features in the model.
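And the matching lift chart, as a sketch:

preds %>%
    lift_curve(priceRange, all_of(prob_cols)) %>%
    autoplot()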
Precision Recall
The better results are closer to 1:1 (the upper-right corner).
False negatives are typically more important here. Recall indicates susceptibility to FNs (lower recall means more susceptible).
… in other words, we want to accurately predict the rentals that should adjust their price (fewer FNs), at the expense of over-predicting rentals that should not (false positives).
The precision-vs-recall curve shows which models give up fewer FPs as we optimize the threshold for FNs.
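A sketch of the precision-recall curves, one per class:

preds %>%
    pr_curve(priceRange, all_of(prob_cols)) %>%
    autoplot()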