# Make UTF-8 the session's default connection encoding before reading the file,
# so that the Icelandic letters in the data are decoded correctly.
options(encoding = "utf-8")
# Load the raw data: semicolon-separated fields, decimal commas, and both
# "(null)" and "NA" treated as missing values.
data <- read.csv2(
  file = "data.csv",
  header = TRUE,
  sep = ";",
  dec = ",",
  na.strings = c("(null)", "NA")
)
# Scale nuvirdi by 1000 (presumably the raw values are recorded in thousands --
# TODO confirm against the data dictionary).
data$nuvirdi <- 1000 * data$nuvirdi
Here we set the variables to the right types, filter the data, and finally rename the variables to English.
# Begin by removing unused variables (dropped by their position in the raw file)
data <- data[-c(1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 20, 21, 25, 26,
                38, 40, 41, 42, 44, 47, 49, 50, 52, 53)]
# Define the property type as a factor
data$teg_eign <- as.factor(data$teg_eign)
# Parse the date column from day.month.year text into a Date.
# as.Date() has no `sep` argument (it was silently ignored); the "." separator
# is already described by `format`.
data$kdagur <- as.Date(data$kdagur, format = "%d.%m.%Y")
# Elevator flag: 0 stays 0 and every other recorded value becomes 1;
# the result is stored as a two-level factor.
data$lyfta <- as.factor(as.integer(data$lyfta != 0))
# Top-floor flag: 1 stays 1 and every other recorded value becomes 0;
# the result is stored as a two-level factor.
data$efstah <- as.factor(as.integer(data$efstah == 1))
# Building type: keep only codes 11 and 12 as levels and give them readable
# labels; any other code becomes NA.
data$ibteg <- factor(
  data$ibteg,
  levels = c(11, 12),
  labels = c("Single family", "Apartment building")
)
# Remove property types that are not defined as homes.
# %in% is used instead of a chain of `==` comparisons: `==` yields NA when the
# property type is missing, and an NA row index inserts an all-NA row into the
# result instead of keeping the original record.
non_home_types <- c(
  "Vinnustofa",
  "Séreign",
  "Herbergi",
  "Gistihús",
  "Fjölbýlishús",
  "Hótelstarfsemi",
  "Ósamþykkt íbúð"
)
data <- data[!(data$teg_eign %in% non_home_types), ]
# Merge the "Íbúðarhús" level into "Íbúðareign"
data$teg_eign <- fct_collapse(data$teg_eign,
  Íbúðareign = c("Íbúðareign", "Íbúðarhús"))
# Drop the levels left without observations by the filtering above
data$teg_eign <- droplevels(data$teg_eign)
# Keep only observations from the capital area: rows whose location code is
# present and below 999.
in_capital_area <- !is.na(data$matssvaedi) & data$matssvaedi < 999
data <- data[in_capital_area, ]
# Turn the location code into a factor and merge the codes into municipalities.
# NOTE(review): "999" is listed under Reykjavíkurborg although the filter above
# removes code 999 -- confirm whether the filter should be `<= 999` or the
# entry is a leftover.
data$matssvaedi <- fct_collapse(
  as.factor(data$matssvaedi),
  Reykjavíkurborg = c(
    "11", "20", "25", "31", "70", "72", "75", "80", "85", "90", "91",
    "100", "110", "120", "130", "140", "150", "160", "161", "170", "180",
    "181", "200", "210", "220", "270", "280", "281", "282", "283", "284",
    "290", "999"
  ),
  Kópavogsbær = c("300", "320", "330", "340", "350", "351"),
  Garðabær = c("500", "510", "511", "520", "530", "540", "550", "560", "590", "700"),
  Hafnarfjarðarkaupstaður = c("600", "620", "630", "640", "650", "660", "670", "680"),
  Mosfellsbær = c("800", "810", "820", "830", "840", "850", "890"),
  Seltjarnarnesbær = c("400")
)
# Replace the Icelandic column names with English ones (assigned by position)
english_names <- c(
  "property_id", "sales_date", "price", "property_type", "year_built",
  "Top_floor", "Apartment_num", "Elevator", "Circumference", "property_m2",
  "netto_m2", "Floor_num", "parking_num", "bathtub_num", "Shower_num",
  "Toilets_num", "Kitchen_num", "Room_num", "LivingR_num", "StorageR_num",
  "ConstructionStage_level", "Garage_area", "Balcony_area", "StorageR_area",
  "Location", "Building_type"
)
names(data) <- english_names
# Show the current (Icelandic) levels of the property type
property_type_levels <- levels(data$property_type)
print(property_type_levels)
## [1] "Einbýlishús" "Íbúðareign" "Parhús" "Raðhús"
# Rename the levels by mapping each Icelandic level to its English label
# explicitly (new = old). Assigning a plain character vector relabels by
# position and would silently mislabel the data if the level order changed.
levels(data$property_type) <- list(
  "single-family property" = "Einbýlishús",
  "Apartment" = "Íbúðareign",
  "two-family building" = "Parhús",
  "town-house" = "Raðhús"
)
Next, we examine the ranges within our variables and assess for any anomalies, specifically searching for outliers and missing values.
# Print per-column summary statistics (quartiles, level counts and NA counts)
summary(data)
## property_id sales_date price
## Min. :10000034 Min. :2012-01-02 Min. : 5900000
## 1st Qu.:10247911 1st Qu.:2013-11-19 1st Qu.: 25254750
## Median :10499449 Median :2015-06-19 Median : 33179000
## Mean :10498834 Mean :2015-04-26 Mean : 36914255
## 3rd Qu.:10748022 3rd Qu.:2016-10-07 3rd Qu.: 43977250
## Max. :10999990 Max. :2018-02-27 Max. :956146000
##
## property_type year_built Top_floor Apartment_num
## single-family property: 3176 Min. :1841 0:31737 Min. :1.000
## Apartment :28823 1st Qu.:1963 1: 3295 1st Qu.:1.000
## two-family building : 780 Median :1983 Median :1.000
## town-house : 2253 Mean :1981 Mean :1.005
## 3rd Qu.:2003 3rd Qu.:1.000
## Max. :2018 Max. :3.000
## NA's :10
## Elevator Circumference property_m2 netto_m2 Floor_num
## 0:26246 Min. : 6.10 Min. : 17.7 Min. : 14.90 Min. :1.0
## 1: 8786 1st Qu.: 47.10 1st Qu.: 74.9 1st Qu.: 64.20 1st Qu.:1.0
## Median : 64.00 Median : 95.5 Median : 82.40 Median :1.0
## Mean : 90.54 Mean :105.2 Mean : 90.54 Mean :1.2
## 3rd Qu.:104.20 3rd Qu.:122.0 3rd Qu.:105.70 3rd Qu.:1.0
## Max. :689.90 Max. :420.1 Max. :438.60 Max. :4.0
##
## parking_num bathtub_num Shower_num Toilets_num
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:1.000
## Median :0.0000 Median :1.0000 Median :1.0000 Median :1.000
## Mean :0.2054 Mean :0.7721 Mean :0.5569 Mean :1.197
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :8.0000 Max. :3.0000 Max. :4.0000 Max. :5.000
##
## Kitchen_num Room_num LivingR_num StorageR_num
## Min. :0.000 Min. : 0.0 Min. : 0.00 Min. :0.0000
## 1st Qu.:1.000 1st Qu.: 2.0 1st Qu.: 1.00 1st Qu.:0.0000
## Median :1.000 Median : 2.0 Median : 1.00 Median :1.0000
## Mean :1.005 Mean : 2.5 Mean : 1.24 Mean :0.5946
## 3rd Qu.:1.000 3rd Qu.: 3.0 3rd Qu.: 1.00 3rd Qu.:1.0000
## Max. :4.000 Max. :24.0 Max. :14.00 Max. :8.0000
##
## ConstructionStage_level Garage_area Balcony_area StorageR_area
## Min. : 4.000 Min. :-70.000 Min. : -0.200 Min. : 0.00
## 1st Qu.:10.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.00
## Median :10.000 Median : 0.000 Median : 5.400 Median : 4.30
## Mean : 9.972 Mean : 7.795 Mean : 6.748 Mean : 5.36
## 3rd Qu.:10.000 3rd Qu.: 0.000 3rd Qu.: 9.400 3rd Qu.: 7.60
## Max. :10.000 Max. :115.400 Max. :269.800 Max. :238.60
## NA's :6 NA's :1 NA's :2
## Location Building_type
## Reykjavíkurborg :19879 Single family : 8552
## Kópavogsbær : 6089 Apartment building:26480
## Seltjarnarnesbær : 645
## Garðabær : 2473
## Hafnarfjarðarkaupstaður: 4773
## Mosfellsbær : 1173
##
We see unusual values in the Garage_area and Balcony_area variables: both contain negative values, which is not possible for an area. We therefore set the negative values to zero, assuming that zero is the true value.
# Set negative garage and balcony areas to zero. which() returns only the
# positions where the comparison is TRUE, so missing values are left untouched.
data$Garage_area <- replace(data$Garage_area, which(data$Garage_area < 0), 0)
data$Balcony_area <- replace(data$Balcony_area, which(data$Balcony_area < 0), 0)
# Next, drop every row that still has a missing value in any column
data <- na.omit(data)
library(ggplot2)
library(ggpubr)
# Print large numbers in fixed rather than scientific notation
options(scipen = 10000)

# Build a histogram of `price` for the data frame `df`.
# `binwidth` defaults to one million: the summary above shows prices from about
# 5.9 million to 956 million, so the previous 1000-wide bins produced hundreds
# of thousands of bars and hid the shape of the distribution.
# NOTE(review): `comma` is presumably scales::comma, attached elsewhere -- confirm.
price_histogram <- function(df, binwidth = 1e6) {
  ggplot(data = df, aes(x = price)) +
    geom_histogram(binwidth = binwidth, fill = "#69b3a2", color = "black") +
    labs(title = "", x = "Price", y = "Frequency") +
    scale_x_continuous(labels = comma) +
    theme_minimal() +
    theme(
      panel.background = element_rect(fill = "#f0f0f0"), # Set the panel background color
      plot.title = element_text(size = 15)
    )
}

# Histogram of prices before outlier removal
plot1 <- price_histogram(data)

# Calculate the IQR of the price
Q1 <- quantile(data$price, 0.25)
Q3 <- quantile(data$price, 0.75)
IQR <- Q3 - Q1
# Define lower and upper bounds for outliers (1.5 * IQR beyond the quartiles)
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Remove outliers
data <- data[data$price >= lower_bound & data$price <= upper_bound, ]

# Histogram of prices after outlier removal
plot2 <- price_histogram(data)

# Show both histograms side by side
ggarrange(plot1, plot2, ncol = 2, nrow = 1)
We observe that the price distribution is right-skewed and contains some extreme values. To address this, we use the interquartile range (IQR) method to identify and remove outliers: observations more than 1.5 × IQR below the first quartile or above the third quartile are dropped. After this adjustment, the distribution of the price variable is closer to normal.
Now we look at price by property type and by location, as we expect both to be factors affecting property price.
# Layers shared by both box plots: light-blue boxes, comma-formatted price
# axis, minimal theme and x-axis labels rotated by 45 degrees.
boxplot_layers <- list(
  geom_boxplot(fill = "lightblue"),
  scale_y_continuous(labels = comma),
  theme_minimal(),
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
# Price by property type
plot1 <- ggplot(data = data, aes(x = property_type, y = price)) +
  boxplot_layers +
  labs(title = "Box Plot of Price by Property Type", x = "Property Type", y = "Price")
# Price by location
plot2 <- ggplot(data = data, aes(x = Location, y = price)) +
  boxplot_layers +
  labs(title = "Box Plot of Price by Location", x = "Location", y = "Price")
# Show both box plots side by side
ggarrange(plot1, plot2, ncol = 2, nrow = 1)
When analyzing the boxplot of price by property type, it becomes apparent that single-family properties have the highest median price, followed by two-family buildings, townhouses, and finally apartments with the lowest median price. Notably, there are some apartments with prices far above the median.
Upon examining the boxplot of price by location, it is clear that Seltjarnarnes and Garðabær stand out as the most expensive municipalities, judged by median price. However, Seltjarnarnes exhibits a wider spread in prices. Mosfellsbær comes next, followed by Kópavogur. Finally, Reykjavík and Hafnarfjörður show similar median housing prices.
# Correlation matrix of price and the numeric property characteristics
# (the same columns as positions 3, 5, 9-20 and 22-24, selected by name)
numeric_vars <- c(
  "price", "year_built", "Circumference", "property_m2", "netto_m2",
  "Floor_num", "parking_num", "bathtub_num", "Shower_num", "Toilets_num",
  "Kitchen_num", "Room_num", "LivingR_num", "StorageR_num",
  "Garage_area", "Balcony_area", "StorageR_area"
)
correlation_matrix <- cor(data[numeric_vars])
# Visualise the correlations
corrplot(correlation_matrix)
We see that property size has the highest correlation with price, so we take a closer look at that relationship. We also see multicollinearity, especially between property_m2 and netto_m2, which have a correlation of 0.99; we deal with that by removing the property_m2 variable. However, since the main goal is prediction, we do not address the weaker multicollinearity that remains.
# Build a scatterplot of property size against price.
#   df       data frame to plot
#   mapping  aes() mapping that selects (and optionally transforms) x and y
#   x_label  x-axis label
#   y_label  y-axis label
size_price_scatter <- function(df, mapping, x_label, y_label) {
  ggplot(df, mapping) +
    geom_point(color = "blue", size = 3, alpha = 0.6) +
    labs(
      title = "Scatterplot of Property Size vs. Price",
      x = x_label,
      y = y_label
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 16, hjust = 0.5),
      axis.title = element_text(size = 14),
      axis.text = element_text(size = 12)
    )
}

# The prices are Icelandic property prices, so the axis is labelled in ISK
# (the previous "$" label was wrong).
# Raw size against raw price
plot1 <- size_price_scatter(
  data, aes(x = netto_m2, y = price),
  x_label = "Property Size (m²)", y_label = "Price (ISK)"
)
# Log size against raw price
plot2 <- size_price_scatter(
  data, aes(x = log(netto_m2), y = price),
  x_label = " log Property Size (m²)", y_label = "Price (ISK)"
)
# Log size against log price
plot3 <- size_price_scatter(
  data, aes(x = log(netto_m2), y = log(price)),
  x_label = " log Property Size (m²)", y_label = "log Price (ISK)"
)
# Arrange the three plots on a 2 x 2 grid
ggarrange(plot1, plot2, plot3, ncol = 2, nrow = 2)
When looking at the relationship between property size and price, we see evidence of heteroscedasticity: as property size (m²) increases, the variability in property prices also increases. To deal with this, we take the log of the variables to see whether the relationship becomes more linear. First we take the log of size, then the log of both size and price. We see the best results when taking the log of both price and size.
Now, we employ two distinct methods to predict property prices and compare their performance. First we apply lasso regression, followed by the utilization of Random Forest.
# Split the data into a training set (70% of the rows) and a test set (the rest).
# NOTE(review): the models below use pricelog and lognetto_m2, which are not
# created in the visible code -- confirm they exist in `data` before this split.
set.seed(1) # fixed seed so the same rows are drawn on every run
n_rows <- nrow(data)
train_ind <- sample(seq_len(n_rows), size = n_rows * 0.7)
train <- data[train_ind, ]
test <- data[-train_ind, ]
# Build the inputs for the lasso. The response is pricelog and `price` is left
# out of the predictors. Columns 1 and 2 of the model matrix are then dropped:
# column 1 is the intercept and column 2 is property_id (the first data column),
# which is why neither appears as a predictor in the coefficient listing.
lasso_formula <- pricelog ~ . - price
X <- model.matrix(lasso_formula, train)[, -c(1, 2)]
y <- train$pricelog
# Cross-validate over the regularization parameter lambda (alpha = 1 is the lasso)
lasso_model <- cv.glmnet(X, y, alpha = 1)
# Keep the lambda with the lowest cross-validated error
best_lambda <- lasso_model$lambda.min
# Plot the cross-validation curve
plot(lasso_model)
# Refit at the chosen lambda and show the coefficients of that model
best_model <- glmnet(X, y, alpha = 1, lambda = best_lambda)
coef(best_model)
## 31 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 10.00067257262
## sales_date 0.00028998723
## property_typeApartment -0.11122425927
## property_typetwo-family building 0.00167158631
## property_typetown-house -0.02217067327
## year_built -0.00037562364
## Top_floor1 -0.01512885711
## Apartment_num 0.03650198521
## Elevator1 0.03067114837
## Circumference 0.00003856038
## netto_m2 -0.00245918708
## Floor_num -0.04416220845
## parking_num 0.09408244964
## bathtub_num -0.03634842144
## Shower_num 0.04095964670
## Toilets_num 0.03745483054
## Kitchen_num -0.00783707639
## Room_num -0.00770161185
## LivingR_num 0.02001452368
## StorageR_num -0.00184573459
## ConstructionStage_level -0.01856731315
## Garage_area 0.00371944436
## Balcony_area 0.00126826618
## StorageR_area 0.00323616183
## LocationKópavogsbær -0.00381214850
## LocationSeltjarnarnesbær 0.12724415997
## LocationGarðabær 0.04218426046
## LocationHafnarfjarðarkaupstaður -0.09729408149
## LocationMosfellsbær -0.05723923025
## Building_typeApartment building -0.02169549666
## lognetto_m2 0.83409154364
# Build the test design matrix the same way as the training one
X_test <- model.matrix(pricelog ~ . - price, test)[, -c(1, 2)]
# Predict log price on the test set
y_pred <- predict(best_model, newx = X_test, s = best_lambda)
# Actual prices (original scale) from the test data
actual_values <- test$price
# Back-transform the log predictions so the RMSE is on the price scale
predicted_prices <- exp(y_pred)
squared_errors <- (actual_values - predicted_prices)^2
mean_squared_error <- mean(squared_errors)
rmse <- sqrt(mean_squared_error)
One of the advantages of using Random Forest is its ability to capture and handle non-linearity in the data without the need for explicit feature engineering or transformation. So we simply use the data without the log transformations.
library(randomForest)
# Predictors for the random forest: every column except the target (price) and
# the log-transformed columns pricelog and lognetto_m2. They are excluded by
# name because the previous positional index c(3, 26, 26) listed column 26
# twice and therefore dropped only one column besides price.
rf_predictors <- setdiff(names(train), c("price", "pricelog", "lognetto_m2"))
# Tune the mtry parameter using the tuneRF function
tune_results <- tuneRF(
  x = train[, rf_predictors], # Predictors only
  y = train$price, # Specify the target variable
  ntree = 500, # Number of trees in the forest
  stepFactor = 1.5, # Factor to adjust mtry
  improve = 0.05, # Minimum improvement in OOB error
  trace = TRUE, # Display progress
  plot = TRUE # Generate a plot of OOB error vs. mtry
)
## mtry = 8 OOB error = 2.031413e+13
## Searching left ...
## mtry = 6 OOB error = 2.043281e+13
## -0.005842105 0.05
## Searching right ...
## mtry = 12 OOB error = 2.047742e+13
## -0.008038366 0.05
# Fit the random forest with mtry = 8, the value with the lowest OOB error in
# the tuning output above.
# The log-transformed columns are excluded by name: the previous index
# c(26, 26) named column 26 twice, so one log column (possibly pricelog, a
# transform of the target) could remain among the predictors.
rf_model <- randomForest(
  price ~ .,
  data = train[, setdiff(names(train), c("pricelog", "lognetto_m2"))],
  mtry = 8,
  importance = TRUE
)
We see that the best option for mtry is 8, so we fit our random forest with mtry = 8.
To compare the models, we will assess the Root Mean Squared Error (RMSE) for each method when predicting on unseen or the test data.
# Collect the RMSE of both models in one table.
# NOTE(review): rmse2 is not computed in the visible code -- presumably the
# random forest's test-set RMSE; confirm where it is defined.
rmse_values <- c(rmse, rmse2)
comparison_table <- data.table(
  "Method" = c("Lasso", " Random Forest"),
  # Format with thousands separators and without scientific notation
  "RMSE" = format(rmse_values, big.mark = ",", scientific = FALSE)
)
# Render the table with a caption
kable(comparison_table, caption = "Comparison of Root Mean Squared Error (RMSE)")
Method | RMSE |
---|---|
Lasso | 5,826,373 |
Random Forest | 4,422,079 |
As we can see from the table, the random forest model outperformed the lasso in predicting property prices. The lasso model has an RMSE of about 5.83 million on the test data and the random forest model about 4.42 million. Considering Icelandic property prices, a deviation of roughly 4 million is not large. Therefore, the model demonstrates considerable success in predicting prices.