Clean Get data

Get data

# Make UTF-8 the default connection encoding so the Icelandic letters in
# the file are read correctly.
options(encoding = "utf-8")

# Import the raw file. read.csv2 already defaults to a header row, ";" as the
# field separator and "," as the decimal mark; "(null)" and "NA" are both
# treated as missing values.
data <- read.csv2("data.csv", na.strings = c("(null)", "NA"))

# Scale the price column by 1000 (presumably it is recorded in thousands --
# TODO confirm against the data dictionary).
data$nuvirdi <- 1000 * data$nuvirdi

clean data

Here we set each variable to the right type, filter the data, and finally rename the variables to English.

# Begin by removing unused variables (dropped by position in the raw file)
data <- data[-c(1,5,6,7,9,10,11,12,13,14,15,17,18,20,21,25,26,38,40,41,42,44,47,49,50, 52,53)]

# Defining property type as factor
data$teg_eign <- as.factor(data$teg_eign)

# Parsing the sales date, stored as day.month.year text, into a Date
# (as.Date has no `sep` argument; the separator is part of `format`)
data$kdagur <- as.Date(data$kdagur, format = "%d.%m.%Y")

# Fixing values in the elevator variable and changing it to a binary factor:
# 0 stays 0, every other value becomes 1
data$lyfta <- as.factor(ifelse(data$lyfta == 0, 0, 1))

# Fixing values in the top-floor variable and changing it to a binary factor:
# 1 stays 1, every other value becomes 0
data$efstah <- as.factor(ifelse(data$efstah == 1, 1, 0))

# Changing the building type variable to a labelled factor; codes other than
# 11 and 12 become NA
data$ibteg <- factor(data$ibteg,
                     levels = c(11, 12),
                     labels = c("Single family", "Apartment building"))

# Removing property types that are not defined as homes.
# %in% never returns NA, so rows with a missing type are kept as ordinary rows
# here (and removed later together with the other incomplete rows) instead of
# being turned into all-NA rows by logical indexing.
non_home_types <- c("Vinnustofa", "Séreign", "Herbergi", "Gistihús",
                    "Fjölbýlishús", "Hótelstarfsemi", "Ósamþykkt íbúð")
data <- data[!(data$teg_eign %in% non_home_types), ]

# Combining the two apartment/house levels into one
data$teg_eign <- forcats::fct_collapse(data$teg_eign,
                                       Íbúðareign = c("Íbúðareign", "Íbúðarhús"))
# Removing unused levels
data$teg_eign <- droplevels(data$teg_eign)

# Only include subjects from the capital area (area codes below 999)
data <- subset(data, matssvaedi < 999)

# Defining the location variable as a factor
data$matssvaedi <- as.factor(data$matssvaedi)

# Merging the area codes into municipalities. Code 999 is not listed because
# the filter above has already removed it, so it can never be a level here.
data$matssvaedi <- forcats::fct_collapse(
  data$matssvaedi,
  Reykjavíkurborg = c("11","20","25","31","70","72","75","80","85","90","91","100","110","120","130","140","150","160","161","170","180","181","200","210","220","270","280","281","282","283","284","290"),
  Kópavogsbær = c("300","320","330","340","350","351"),
  Garðabær = c("500","510","511","520","530","540","550","560","590","700"),
  Hafnarfjarðarkaupstaður = c("600","620","630","640","650","660","670","680"),
  Mosfellsbær = c("800","810","820","830","840","850","890"),
  Seltjarnarnesbær = c("400")
)

# Change names to English. The names are assigned by position, so fail loudly
# if the number of remaining columns is not what this list expects.
english_names <- c('property_id', 'sales_date', 'price', 'property_type', 'year_built', 'Top_floor', 'Apartment_num', 'Elevator', 'Circumference', 'property_m2', 'netto_m2','Floor_num', 'parking_num', 'bathtub_num', 'Shower_num', 'Toilets_num', 'Kitchen_num', 'Room_num', 'LivingR_num', 'StorageR_num','ConstructionStage_level', 'Garage_area', 'Balcony_area', 'StorageR_area', 'Location','Building_type')
stopifnot(ncol(data) == length(english_names))
names(data) <- english_names


# Display the current levels of property type
print(levels(data$property_type))
## [1] "Einbýlishús" "Íbúðareign"  "Parhús"      "Raðhús"
# Rename levels by name rather than by position, so a change in level order
# cannot silently mislabel the property types
data$property_type <- forcats::fct_recode(
  data$property_type,
  "single-family property" = "Einbýlishús",
  "Apartment" = "Íbúðareign",
  "two-family building" = "Parhús",
  "town-house" = "Raðhús"
)

Next, we examine the ranges within our variables and assess for any anomalies, specifically searching for outliers and missing values.

# Per-column summary of the cleaned data (ranges, factor counts, NA counts)
summary(data)
##   property_id         sales_date             price          
##  Min.   :10000034   Min.   :2012-01-02   Min.   :  5900000  
##  1st Qu.:10247911   1st Qu.:2013-11-19   1st Qu.: 25254750  
##  Median :10499449   Median :2015-06-19   Median : 33179000  
##  Mean   :10498834   Mean   :2015-04-26   Mean   : 36914255  
##  3rd Qu.:10748022   3rd Qu.:2016-10-07   3rd Qu.: 43977250  
##  Max.   :10999990   Max.   :2018-02-27   Max.   :956146000  
##                                                             
##                 property_type     year_built   Top_floor Apartment_num  
##  single-family property: 3176   Min.   :1841   0:31737   Min.   :1.000  
##  Apartment             :28823   1st Qu.:1963   1: 3295   1st Qu.:1.000  
##  two-family building   :  780   Median :1983             Median :1.000  
##  town-house            : 2253   Mean   :1981             Mean   :1.005  
##                                 3rd Qu.:2003             3rd Qu.:1.000  
##                                 Max.   :2018             Max.   :3.000  
##                                 NA's   :10                              
##  Elevator  Circumference     property_m2       netto_m2        Floor_num  
##  0:26246   Min.   :  6.10   Min.   : 17.7   Min.   : 14.90   Min.   :1.0  
##  1: 8786   1st Qu.: 47.10   1st Qu.: 74.9   1st Qu.: 64.20   1st Qu.:1.0  
##            Median : 64.00   Median : 95.5   Median : 82.40   Median :1.0  
##            Mean   : 90.54   Mean   :105.2   Mean   : 90.54   Mean   :1.2  
##            3rd Qu.:104.20   3rd Qu.:122.0   3rd Qu.:105.70   3rd Qu.:1.0  
##            Max.   :689.90   Max.   :420.1   Max.   :438.60   Max.   :4.0  
##                                                                           
##   parking_num      bathtub_num       Shower_num      Toilets_num   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:1.000  
##  Median :0.0000   Median :1.0000   Median :1.0000   Median :1.000  
##  Mean   :0.2054   Mean   :0.7721   Mean   :0.5569   Mean   :1.197  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.000  
##  Max.   :8.0000   Max.   :3.0000   Max.   :4.0000   Max.   :5.000  
##                                                                    
##   Kitchen_num       Room_num     LivingR_num     StorageR_num   
##  Min.   :0.000   Min.   : 0.0   Min.   : 0.00   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.: 2.0   1st Qu.: 1.00   1st Qu.:0.0000  
##  Median :1.000   Median : 2.0   Median : 1.00   Median :1.0000  
##  Mean   :1.005   Mean   : 2.5   Mean   : 1.24   Mean   :0.5946  
##  3rd Qu.:1.000   3rd Qu.: 3.0   3rd Qu.: 1.00   3rd Qu.:1.0000  
##  Max.   :4.000   Max.   :24.0   Max.   :14.00   Max.   :8.0000  
##                                                                 
##  ConstructionStage_level  Garage_area       Balcony_area     StorageR_area   
##  Min.   : 4.000          Min.   :-70.000   Min.   : -0.200   Min.   :  0.00  
##  1st Qu.:10.000          1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:  0.00  
##  Median :10.000          Median :  0.000   Median :  5.400   Median :  4.30  
##  Mean   : 9.972          Mean   :  7.795   Mean   :  6.748   Mean   :  5.36  
##  3rd Qu.:10.000          3rd Qu.:  0.000   3rd Qu.:  9.400   3rd Qu.:  7.60  
##  Max.   :10.000          Max.   :115.400   Max.   :269.800   Max.   :238.60  
##                          NA's   :6         NA's   :1         NA's   :2       
##                     Location                Building_type  
##  Reykjavíkurborg        :19879   Single family     : 8552  
##  Kópavogsbær            : 6089   Apartment building:26480  
##  Seltjarnarnesbær       :  645                             
##  Garðabær               : 2473                             
##  Hafnarfjarðarkaupstaður: 4773                             
##  Mosfellsbær            : 1173                             
## 

We see unusual values in the Garage_area and Balcony_area variables: both contain negative values, which is not possible for an area. We therefore set the negative values to zero, assuming that zero is the real value.

# Floor the two area variables at zero; missing values stay missing
data$Garage_area <- pmax(data$Garage_area, 0)
data$Balcony_area <- pmax(data$Balcony_area, 0)

## Then drop every row that still contains a missing value

data <- na.omit(data)

Descriptive plots

Distribution of price variable

library(ggplot2)
library(ggpubr)
# Print large numbers in fixed notation instead of scientific notation
options(scipen=10000)

# Histogram of sale prices for the given data frame.
# A fixed number of bins is used: the original binwidth of 1000 would create
# on the order of a million bins for prices that range into the hundreds of
# millions. scales::comma is namespace-qualified so the chunk does not depend
# on scales being attached.
price_histogram <- function(df) {
  ggplot(data = df, aes(x = price)) +
    geom_histogram(bins = 60, fill = "#69b3a2", color = "black") +
    labs(title = "", x = "Price", y = "Frequency") +
    scale_x_continuous(labels = scales::comma) +
    theme_minimal() +
    theme(
      panel.background = element_rect(fill = "#f0f0f0"),  # Set the panel background color
      plot.title = element_text(size = 15)
    )
}

# Histogram before outlier removal
plot1 <- price_histogram(data)

# Calculate the interquartile range (stored as price_iqr so that the
# stats::IQR function is not masked)
Q1 <- quantile(data$price, 0.25)
Q3 <- quantile(data$price, 0.75)
price_iqr <- Q3 - Q1

# Define lower and upper bounds for outliers (1.5 * IQR beyond the quartiles)
lower_bound <- Q1 - 1.5 * price_iqr
upper_bound <- Q3 + 1.5 * price_iqr

# Remove outliers
data <- data[data$price >= lower_bound & data$price <= upper_bound, ]

# Histogram after outlier removal
plot2 <- price_histogram(data)

ggarrange(plot1, plot2,
          ncol = 2, nrow = 1)

We observe that the price distribution is right-skewed, and we have some extreme values. To address this, I’ll employ the Interquartile Range (IQR) method to identify and eliminate outliers. Following this adjustment, the price variable exhibits a distribution that is closer to normal.

Price by Property Type and location,

Now we look at price by property type, as we expect property type to be a factor affecting property price.

# Layers shared by both box plots. scales::comma is namespace-qualified so the
# chunk does not depend on scales being attached.
boxplot_layers <- list(
  geom_boxplot(fill = "lightblue"),
  scale_y_continuous(labels = scales::comma),
  theme_minimal(),
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate labels by 45 degrees
)

# Price by property type
plot1 <- ggplot(data = data, aes(x = property_type, y = price)) +
  boxplot_layers +
  labs(title = "Box Plot of Price by Property Type", x = "Property Type", y = "Price")

# Price by location
plot2 <- ggplot(data = data, aes(x = Location, y = price)) +
  boxplot_layers +
  labs(title = "Box Plot of Price by Location", x = "Location", y = "Price")

ggarrange(plot1, plot2, ncol = 2, nrow = 1)

When analyzing the boxplot illustrating the relationship between property type and price, it becomes apparent that single-family properties have the highest median price, followed by two-family buildings, townhouses, and finally apartments with the lowest median price. Notably, there are some apartments with prices far above the typical apartment price.

Upon examining the boxplot depicting the relationship between location and price, it is clear that Seltjarnarnes and Garðabær stand out as the most expensive municipalities by median price. However, Seltjarnarnes exhibits a wider spread in prices. Following these two, Mosfellsbær is next, followed by Kópavogur. Finally, Reykjavík and Hafnarfjörður show similar median housing prices.

correlation matrix

# Numeric variables included in the correlation matrix, selected by name so
# the selection does not depend on column positions
numeric_vars <- c("price", "year_built", "Circumference", "property_m2",
                  "netto_m2", "Floor_num", "parking_num", "bathtub_num",
                  "Shower_num", "Toilets_num", "Kitchen_num", "Room_num",
                  "LivingR_num", "StorageR_num", "Garage_area",
                  "Balcony_area", "StorageR_area")
correlation_matrix <- cor(data[numeric_vars])
# Namespace-qualified so the chunk does not depend on corrplot being attached
corrplot::corrplot(correlation_matrix)

We see that property size has the highest correlation with price, so we take a closer look at that relationship below. We also see multicollinearity, especially between property_m2 and netto_m2, which have a correlation of 0.99; we deal with that by removing the property_m2 variable. However, since the main goal is prediction, we do not address the weaker multicollinearity that remains.

Plot netto_m2 against price

# Scatterplot of property size against price with the shared styling.
# `mapping` chooses the raw or log-transformed axes; the labels are passed in
# to match. Prices are labelled in ISK rather than "$" because the data are
# Icelandic property sales.
size_price_scatter <- function(df, mapping, x_label, y_label) {
  ggplot(df, mapping) +
    geom_point(color = "blue", size = 3, alpha = 0.6) +
    labs(
      title = "Scatterplot of Property Size vs. Price",
      x = x_label,
      y = y_label
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 16, hjust = 0.5),
      axis.title = element_text(size = 14),
      axis.text = element_text(size = 12)
    )
}

# Raw size against raw price
plot1 <- size_price_scatter(
  data, aes(x = netto_m2, y = price),
  "Property Size (m²)", "Price (ISK)"
)

# Log size against raw price
plot2 <- size_price_scatter(
  data, aes(x = log(netto_m2), y = price),
  " log Property Size (m²)", "Price (ISK)"
)

# Log size against log price
plot3 <- size_price_scatter(
  data, aes(x = log(netto_m2), y = log(price)),
  " log Property Size (m²)", "log Price (ISK)"
)


ggarrange(plot1, plot2,plot3,
          ncol = 2, nrow = 2)

When looking at the relationship between property size and price, we see evidence of heteroscedasticity: as property size (m²) increases, the variability in property prices also increases. To deal with this, we take the log of the variables to see whether the relationship becomes more linear. First we take the log of size, then the log of both size and price. We see the best results when taking the log of both price and size.

Predicting Price

Now, we employ two distinct methods to predict property prices and compare their performance. First we apply lasso regression, followed by the utilization of Random Forest.

## Begin by preparing the modelling variables and creating test and train data

# The models below use `pricelog` and `lognetto_m2`, and the analysis above
# decides to drop `property_m2`, but none of these steps appears in the code
# shown. Apply them here only if they have not been applied already, so the
# chunk is safe to run either way.
# NOTE(review): confirm these steps do not duplicate a hidden chunk.
data$property_m2 <- NULL
if (!"pricelog" %in% names(data)) {
  data$pricelog <- log(data$price)
}
if (!"lognetto_m2" %in% names(data)) {
  data$lognetto_m2 <- log(data$netto_m2)
}

set.seed(1)

# Training set is 70% of the original rows; floor() makes the truncation of
# the fractional sample size explicit
train_ind <- sample(seq_len(nrow(data)), size = floor(nrow(data) * 0.7))
train <- data[train_ind, ]
test <- data[-train_ind, ]

Lasso regression

## First we create the matrices for lasso
# The formula leaves out the untransformed price and the property identifier
# by name (the identifier was previously dropped by column position); [, -1]
# removes the intercept column, which glmnet adds itself.
X <- model.matrix(pricelog ~ . - price - property_id, train)[, -1]
y <- train$pricelog  # Dependent variable (log price)



# Use cv.glmnet to perform cross-validation to find the optimal value of the
# regularization parameter (lambda). glmnet functions are namespace-qualified
# so the chunk does not depend on glmnet being attached.
lasso_model <- glmnet::cv.glmnet(X, y, alpha = 1)  # alpha = 1 specifies Lasso regression


# Save the lambda with the lowest cross-validated error in "best_lambda"
best_lambda <- lasso_model$lambda.min

plot(lasso_model)

# Refit at the chosen lambda and show the coefficients of the best model
best_model <- glmnet::glmnet(X, y, alpha = 1, lambda = best_lambda)
coef(best_model)
## 31 x 1 sparse Matrix of class "dgCMatrix"
##                                              s0
## (Intercept)                      10.00067257262
## sales_date                        0.00028998723
## property_typeApartment           -0.11122425927
## property_typetwo-family building  0.00167158631
## property_typetown-house          -0.02217067327
## year_built                       -0.00037562364
## Top_floor1                       -0.01512885711
## Apartment_num                     0.03650198521
## Elevator1                         0.03067114837
## Circumference                     0.00003856038
## netto_m2                         -0.00245918708
## Floor_num                        -0.04416220845
## parking_num                       0.09408244964
## bathtub_num                      -0.03634842144
## Shower_num                        0.04095964670
## Toilets_num                       0.03745483054
## Kitchen_num                      -0.00783707639
## Room_num                         -0.00770161185
## LivingR_num                       0.02001452368
## StorageR_num                     -0.00184573459
## ConstructionStage_level          -0.01856731315
## Garage_area                       0.00371944436
## Balcony_area                      0.00126826618
## StorageR_area                     0.00323616183
## LocationKópavogsbær              -0.00381214850
## LocationSeltjarnarnesbær          0.12724415997
## LocationGarðabær                  0.04218426046
## LocationHafnarfjarðarkaupstaður  -0.09729408149
## LocationMosfellsbær              -0.05723923025
## Building_typeApartment building  -0.02169549666
## lognetto_m2                       0.83409154364
# Build the test design matrix the same way as the training one: untransformed
# price and property identifier excluded by name, intercept column dropped
X_test <- model.matrix(pricelog ~ . - price - property_id, test)[, -1]
y_pred <- predict(best_model, s = best_lambda, newx = X_test)


# Extract the actual response variable (price) from the test data
actual_values <- test$price

# Calculate RMSE on the original price scale: predictions are of log price,
# so they are back-transformed with exp() before comparison

squared_errors <- (actual_values - as.numeric(exp(y_pred)))^2

mean_squared_error <- mean(squared_errors)

rmse <- sqrt(mean_squared_error)

Random forest

One of the advantages of using Random Forest is its ability to capture and handle non-linearity in the data without the need for explicit feature engineering or transformation. So we simply use the data without log transformations.

library(randomForest)




# Predictors for the random forest: everything except the target and the two
# log-transformed columns, chosen by name. The original positional index
# -c(3,26,26) listed column 26 twice and depended on column order.
# NOTE(review): property_id is still among the predictors here, although the
# lasso model excludes it -- confirm whether that is intended.
rf_predictors <- setdiff(names(train), c("price", "pricelog", "lognetto_m2"))

# Tune the mtry parameter using the tuneRF function
tune_results <- tuneRF(
  x = train[, rf_predictors],  # Exclude the target and log-transformed columns
  y = train$price,  # Specify the target variable
  ntree = 500,  # Number of trees in the forest
  stepFactor = 1.5,  # Factor to adjust mtry
  improve = 0.05,  # Minimum improvement in OOB error
  trace = TRUE,  # Display progress
  plot = TRUE  # Generate a plot of OOB error vs. mtry
)
## mtry = 8  OOB error = 2.031413e+13 
## Searching left ...
## mtry = 6     OOB error = 2.043281e+13 
## -0.005842105 0.05 
## Searching right ...
## mtry = 12    OOB error = 2.047742e+13 
## -0.008038366 0.05

## Choose the best mtry: the row of the tuning results with the lowest OOB
## error (column 1 holds mtry, column 2 the OOB error)
best_mtry <- tune_results[which.min(tune_results[, 2]), 1]

# Fit on everything except the log-transformed columns, selected by name. The
# original positional index -c(26,26) listed column 26 twice and depended on
# column order.
# NOTE(review): property_id is still among the predictors here, although the
# lasso model excludes it -- confirm whether that is intended.
rf_columns <- setdiff(names(train), c("pricelog", "lognetto_m2"))
rf_model <- randomForest(price ~ ., data = train[, rf_columns],
                         mtry = best_mtry, importance = TRUE)

# Test-set RMSE for the random forest; the comparison table below uses it as
# `rmse2`, which was not computed anywhere in the code shown
rf_pred <- predict(rf_model, newdata = test)
rmse2 <- sqrt(mean((test$price - rf_pred)^2))

We see that the best option for mtry is 8, so we create our random forest with mtry = 8.

comparison of methods

To compare the models, we will assess the Root Mean Squared Error (RMSE) for each method when predicting on unseen or the test data.

# Create a data.table to compare the test-set RMSE of the two models.
# data.table and knitr are namespace-qualified so the chunk does not depend on
# those packages (or a pipe operator) being attached.
comparison_table <- data.table::data.table(
  "Method" = c("Lasso"," Random Forest"),
  "RMSE" = c(rmse,rmse2)
)
# Format the RMSE column with thousands separators and no scientific notation
comparison_table$RMSE <- format(comparison_table$RMSE, big.mark = ",", scientific = FALSE)
knitr::kable(comparison_table,
             caption = "Comparison of Root Mean Squared Error (RMSE)")
Comparison of Root Mean Squared Error (RMSE)
Method RMSE
Lasso 5,826,373
Random Forest 4,422,079

As we can see from the table, the random forest model outperformed the lasso model in forecasting property prices. The lasso model is off by roughly 5.83 million (RMSE) from the true value and the random forest model by roughly 4.42 million. Considering Icelandic property prices, a deviation of plus or minus 4 million is not significant. Therefore, the model demonstrates considerable success in predicting prices.