library(tidyverse)
library(tidyquant) # contains coord_x_datetime for dates as coordinates
library(scales)
Load & Wrangle Zillow Data
Intro loops & functions
Library load
This data, from Zillow, comes to us in two files; one associated with selling price and one with listing price. We well join the datasets together so that each house is an observation and list price and sell price are variables.
0. load data
<- read_csv("https://euclid.nmu.edu/~joshthom/Teaching/DAT309/Week9/Metro_invt_fs_uc_sfrcondo_sm_month_sell.csv") |> janitor::clean_names()
sell
<- read_csv("http://euclid.nmu.edu/~joshthom/Teaching/DAT309/Week9/Metro_new_listings_uc_sfrcondo_sm_month_list.csv") |> janitor::clean_names()
list
print(sell, n = 5)
# A tibble: 928 × 84
region_id size_rank region_name region_type state_name x2018_03_31 x2018_04_30
<dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl>
1 102001 0 United Sta… country <NA> 1421528 1500193
2 394913 1 New York, … msa NY 73707 80345
3 753899 2 Los Angele… msa CA 21998 23784
4 394463 3 Chicago, IL msa IL 38581 42253
5 394514 4 Dallas, TX msa TX 24043 25876
# ℹ 923 more rows
# ℹ 77 more variables: x2018_05_31 <dbl>, x2018_06_30 <dbl>, x2018_07_31 <dbl>,
# x2018_08_31 <dbl>, x2018_09_30 <dbl>, x2018_10_31 <dbl>, x2018_11_30 <dbl>,
# x2018_12_31 <dbl>, x2019_01_31 <dbl>, x2019_02_28 <dbl>, x2019_03_31 <dbl>,
# x2019_04_30 <dbl>, x2019_05_31 <dbl>, x2019_06_30 <dbl>, x2019_07_31 <dbl>,
# x2019_08_31 <dbl>, x2019_09_30 <dbl>, x2019_10_31 <dbl>, x2019_11_30 <dbl>,
# x2019_12_31 <dbl>, x2020_01_31 <dbl>, x2020_02_29 <dbl>, …
print(list, n = 5)
# A tibble: 923 × 84
region_id size_rank region_name region_type state_name x2018_03_31 x2018_04_30
<dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl>
1 102001 0 United Sta… country <NA> 391816 453877
2 394913 1 New York, … msa NY 20008 23619
3 753899 2 Los Angele… msa CA 9187 10007
4 394463 3 Chicago, IL msa IL 12675 15359
5 394514 4 Dallas, TX msa TX 8606 10075
# ℹ 918 more rows
# ℹ 77 more variables: x2018_05_31 <dbl>, x2018_06_30 <dbl>, x2018_07_31 <dbl>,
# x2018_08_31 <dbl>, x2018_09_30 <dbl>, x2018_10_31 <dbl>, x2018_11_30 <dbl>,
# x2018_12_31 <dbl>, x2019_01_31 <dbl>, x2019_02_28 <dbl>, x2019_03_31 <dbl>,
# x2019_04_30 <dbl>, x2019_05_31 <dbl>, x2019_06_30 <dbl>, x2019_07_31 <dbl>,
# x2019_08_31 <dbl>, x2019_09_30 <dbl>, x2019_10_31 <dbl>, x2019_11_30 <dbl>,
# x2019_12_31 <dbl>, x2020_01_31 <dbl>, x2020_02_29 <dbl>, …
1. rename x’s with sell and list, respectively
First, we rename variables to make it easier to interpret the list & sell price variables. We have rename_with
use the str_replace
function to perform a renaming of many variables at once.
- The
~
syntax makesstr_replace
into a function whose argument. - The data is sent to the
.
spot.
- Thy syntax is str_replace(data, thing_to_replace, new_string)
<- sell |> rename_with(~str_replace(.,"x","sell_"))
sell <- list |> rename_with(~str_replace(.,"x","list_")) list
2. use names() and head() to note the 1st 10 variables
|> names() |> head(n=10) list
[1] "region_id" "size_rank" "region_name" "region_type"
[5] "state_name" "list_2018_03_31" "list_2018_04_30" "list_2018_05_31"
[9] "list_2018_06_30" "list_2018_07_31"
|> names() |> head() sell
[1] "region_id" "size_rank" "region_name" "region_type"
[5] "state_name" "sell_2018_03_31"
3. Joining 101:
There are several ways of joining two data sets together, some are illustrated below.
join two data sets with full_join
Joining two data frames using “left_join” or “right_join” requires a variable common to both data frames and keeps data. We use full_join
to bring in new (non-redundant) variables.
Learn more about joining here.
<- full_join(sell,list) list_sell
5. pivot to get plotable data
#
# selling price & listing price are variables
<- pivot_longer(list_sell,
zillow cols = starts_with("sell"),
names_to = "sell_date",
values_to = "sell_price") |>
pivot_longer(
cols = starts_with("list"),
names_to = "list_date",
values_to = "list_price")
<- zillow |>
zillow mutate(
list_date = as_datetime(str_remove(list_date,"list_")),
sell_date = as_datetime(str_remove(sell_date,"sell_"))
|>
) relocate(list_date,.before=1)
7. filter to get some reasonable sized data
# parameter "name" : vector of strings of state names: c("CO","MI")
# parameter "number" : the number of regions to pull from each states
<- function(name, number){
get_states
# empty dataset
<- {}
d for (i in 1:length(name)){
# print the name of the state for sanity
print(name[i])
# get all data for the given name
<- filter(zillow, state_name %in% name[i])
my_states
# grab a few region_names from each state
<- distinct(my_states,region_name) |>
some_names slice_sample(n = number)
<- filter(my_states,
some_regions %in% pull(some_names))
region_name # could also do some_names[[1]] to extract the vector from the tibble
<- bind_rows(d,some_regions)
d
}return(d)
}
<- get_states(c("CO","MI","AL","FL"),3) zil_sub
[1] "CO"
[1] "MI"
[1] "AL"
[1] "FL"
8. plot prices for each region
<- ggplot(zil_sub, aes(x = sell_date, y = sell_price,
p group = region_name,
color = region_name)) + geom_line()
print(p)
9. Plot list prices for each region
# option 1: plot alongside the sell plot
<- ggplot(zil_sub, aes(x = list_date, y = list_price,
q group = region_name,
color = region_name)) + geom_line()
print(q)
This works better:
<- mutate(zil_sub,
zz sell_date = as_date(sell_date),
list_date = as_date(list_date))
<- ggplot(zz, aes(x = sell_date, y = sell_price,
p group = region_name,
color = region_name)) +
# below, we draw solid lines for selling prices
geom_line(linetype = 1) +
geom_line(aes(x = list_date, y = list_price,
group = region_name,
color = region_name), linetype = 2)
This is even better:
# zoom to the interesting region
+ scale_x_date(date_breaks = "2 years") p
#p + coord_x_datetime(xlim = c("2018-01-01", "2023-12-31"))
# even better still
+ scale_x_date(date_breaks = "1 year", date_labels= "%Y") p
Zoom in on post 2018. Here we use coord_x_date
to showcase how to zoom without losing data. (The coord_
functions allow for zooming without removing data that might be used to compute a statistic)
+ scale_x_date(
p date_breaks = "1 year",
date_labels= "%Y") +
theme(axis.text.x = element_text(size = 5)) +
coord_x_date(xlim = c("2018-01-01","2024-01-01"))
Get better formatting on the y-axis
+ scale_x_date(
p date_breaks = "1 year",
date_labels= "%Y") +
theme(axis.text.x = element_text(size = 5)) +
scale_y_continuous(label = label_dollar(prefix = "$"))
10. zoom to the interesting region
+ scale_x_date(
p date_breaks = "1 year",
date_labels= "%Y",
limits =
c(as_date("2018-01-01"), as_date("2023-12-31"))) +
theme(axis.text.x = element_text(size = 5)) +
scale_y_continuous(label = label_dollar(prefix = "$")) +
labs(title = "Selling Price vs. Listing Price",
subtitle = "Dotted line is listing price and solid line is selling price")
Save data for future work
This saves the variable zil_sub
to the filename indicated below. Load it back up in R with load("zillow-data.Rdata")
.
save(zil_sub, file = "zillow-data.Rdata")