For this case study, you have been contracted by the Irish Revenue. Using only the data specified below, the Revenue require 3 primary objectives to be addressed:
The below formula is how to calculate stamp duty:

$$ \text{Stamp Duty Due} = \text{Stamp Duty Rate} \times (\text{Sale Price} - (\text{Sale Price} \times \text{VAT})) $$

$$ \text{Stamp Duty Due} = 0.01 \times (\text{Sale Price} - (\text{Sale Price} \times 0.135)) $$

Before beginning, make sure to visit the provided links to gain a better understanding of the data.
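As a quick sanity check of the formula, here is a worked example in Python (the €300,000 sale price is hypothetical):
# worked example of the stamp duty formula on a hypothetical €300,000 sale
sale_price = 300000
stamp_duty_rate = 0.01
VAT = 0.135
stamp_duty_due = stamp_duty_rate * (sale_price - (sale_price * VAT))
print(stamp_duty_due)  # 2595.0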
Please read through the entire assessment before beginning.
The first step to perform is to download and read the data by following the below steps.
Download PPR-ALL.zip and load PPR-ALL.csv into a pandas DataFrame. Only load the following columns: Date of Sale (dd/mm/yyyy), Address, County, Price (€), VAT Exclusive, Not Full Market Price, Description of Property.

Download HPM09.<time_of_download>.csv and load HPM09.<time_of_download>.csv into a pandas DataFrame.

'''
Import the standard Python libraries used.
'''
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="darkgrid")
'''
Load PPR-ALL.csv into a pandas DataFrame. Only load the following columns:
Date of Sale (dd/mm/yyyy), Address, County, Price (€), VAT Exclusive, Not Full Market Price, Description of Property
Load HPM09.<time_of_download>.csv into a pandas DataFrame
'''
# define columns to load from the PPR-ALL.csv file
# note, the column heading "Price (€)" has been changed to "Price" to avoid unicode errors
# note, the column heading "Date of Sale (dd/mm/yyyy)" has been changed to "Date" for simplicity
ppr_columns = [
"Date",
"Address",
"County",
"Price",
"VAT Exclusive",
"Not Full Market Price",
"Description of Property"
]
# use pandas read_csv method to load in desired columns from PPR-ALL.csv into a dataframe
property_prices = pd.read_csv("source_data/PPR-ALL.csv",usecols=ppr_columns)
# use pandas read_csv method to load in all columns from HPM09.20210716T170716.csv into a dataframe
residential_price_index = pd.read_csv("source_data/HPM09.20210716T170716.csv")
Next, complete the following steps to obtain the dataset which the exploratory analysis will be performed on.
Note, perform any additional steps that may be warranted based on your understanding of the data that you have observed from the data itself and/or from the information provided in the links.

- For the assessment use data from 2010 to 2019 (inclusive). Print length of DataFrame.
- Filter out all counties except for the following: Dublin, Cork, Galway, Kildare and Meath. Print length of DataFrame.
- Include only the Residential Property Price Index statistic. Print length of DataFrame.
- Remove types of residential properties that are not either Dublin - all residential properties or National excluding Dublin - all residential properties.

'''
Property Price Data Cleaning
For the assessment use data from 2010 to 2019 (inclusive). Print length of DataFrame.
Filter out all counties except for the following: Dublin, Cork, Galway, Kildare and Meath. Print length of DataFrame.
'''
# convert Date from string to datetime in property_prices
property_prices["Date"] = pd.to_datetime(property_prices["Date"], format="%d/%m/%Y")
# create mask (boolean indicator for each row in dataframe) for dates between 01/01/2010 - 31/12/2019
mask = (property_prices["Date"] >= "2010-01-01") & (property_prices["Date"] < "2020-01-01")
# filter property_prices by date mask
property_prices = property_prices.loc[mask]
# print property_prices length
print("Property prices (2010-2019) data length:", len(property_prices))
# define counties to include
counties = [
"Dublin",
"Cork",
"Galway",
"Kildare",
"Meath"
]
# create mask (boolean indicator for each row in dataframe) for counties in counties list
mask = property_prices["County"].isin(counties)
# filter property_prices by county mask
property_prices = property_prices.loc[mask]
# print property_prices length
print("Property prices (2010-2019) data in Dublin, Cork, Galway, Kildare and Meath has length:", len(property_prices))
Property prices (2010-2019) data length: 408696
Property prices (2010-2019) data in Dublin, Cork, Galway, Kildare and Meath has length: 234276
'''
RPI Data Cleaning
For the assessment use data from 2010 to 2019 (inclusive). Print length of DataFrame.
Include only Residential Property Price Index statistic. Print length of DataFrame.
Remove types of residential properties that are not either Dublin - all residential properties or National excluding Dublin - all residential properties.
'''
# convert Month from string to datetime in residential_price_index
residential_price_index["Month_Timestamp"] = pd.to_datetime(residential_price_index["Month"], format="%YM%m")
# filter residential_price_index for dates between 01/01/2010 - 31/12/2019 using the parsed timestamp
residential_price_index = residential_price_index.query("Month_Timestamp >= '2010-01-01' & Month_Timestamp < '2020-01-01'")
# print residential_price_index length
print("Residential price index data (2010-2019) length:", len(residential_price_index))
# filter residential_price_index for "Residential Property Price Index" statistic
residential_price_index = residential_price_index.query("Statistic == 'Residential Property Price Index'")
# print residential_price_index length
print("Residential price index data (2010-2019, RPI only) length:", len(residential_price_index))
# filter residential_price_index for either:
# Dublin - all residential properties or National excluding Dublin - all residential properties
residential_price_index = residential_price_index.query(
"`Type of Residential Property` == 'Dublin - all residential properties' | \
`Type of Residential Property` == 'National excluding Dublin - all residential properties'"
)
# print residential_price_index length
print(
"Residential price index data (2010-2019, RPI only) length,",
"\nDublin - all residential properties & National excluding Dublin - all residential properties:" ,
len(residential_price_index)
)
Residential price index data (2010-2019) length: 9600
Residential price index data (2010-2019, RPI only) length: 2400
Residential price index data (2010-2019, RPI only) length,
Dublin - all residential properties & National excluding Dublin - all residential properties: 240
'''
Note, perform any additional steps that may be warranted based on your understanding of the data that you have observed from the data itself and/or from the information provided in the links.
'''
# there are two types of property in the property_prices data: new and second hand
# sometimes the property description is in English, sometimes in Irish
# to simplify this distinction a boolean column, "New", will be used to categorise dwellings as new (True) or second hand (False)
# define a list of words that denote a property as new; these words only exist in the descriptions of new properties
new_words = ["New", "Nua"]
# create boolean indicator for each row in dataframe for property new status (True or False)
# na=False treats any missing descriptions as second hand
new = property_prices["Description of Property"].str.contains('|'.join(new_words), na=False)
# create "New" column in property_prices data and set it equal to the new indicator
property_prices["New"] = new
# convert price data from string to float: strip the leading € symbol and remove thousands separators
property_prices["Price"] = property_prices["Price"].str[1:].replace(',', '', regex=True).astype(float)
residential_price_index["VALUE"] = residential_price_index["VALUE"].astype(float)
# On the website for the Property Price data it says:
# "If the property is a new property, the price shown should be exclusive of VAT at 13.5%."
# To compare property values on a consistent basis it is therefore useful
# to have the VAT-exclusive price for all properties
VAT = 0.135
property_prices["Sale_Price_ex_VAT"] = np.where(
property_prices["VAT Exclusive"] == 'Yes',
property_prices["Price"],
property_prices["Price"] / (1 + VAT)
)
# simplify "Type of Residential Property" description as only 2 types remain
residential_price_index["Type of Residential Property"] = np.where(
residential_price_index["Type of Residential Property"] == 'Dublin - all residential properties',
"Dublin",
"National (ex Dublin)"
)
'''
Combine Data
Combine both data sources in such a way that type of residential properties categorisations are aligned appropriately
'''
# In order to merge the data we need keys in the data to join on. The RPI data is split up
# by location, month & dwelling category (house, apartment). In order to merge on these fields
# we need the common fields to match in both data sources. The dates in property_prices need to
# be adjusted to the start of the month to match the RPI data.
# create adjusted timestamp for the start of the month (to match with RPI data) to perform left join on
property_prices["RPI_Month"] = property_prices["Date"].apply(lambda x : x.replace(day=1))
# there isn't enough information in property_prices to decipher between houses and apartments,
# therefore the best way to split up the RPI data to match the property_prices data is into 2
# categories: "Dublin" & "National (ex Dublin)"
# create adjusted location to match with RPI data to perform left join on
property_prices["RPI_Category"] = np.where(
property_prices["County"] == 'Dublin',
"Dublin",
"National (ex Dublin)"
)
# Left join property_prices & residential_price_index on category (Dublin/ non-Dublin) & date (RPI month)
prices_with_rpi = pd.merge(
property_prices,
residential_price_index[["Month_Timestamp", "Type of Residential Property", "VALUE"]],
how='left', left_on=['RPI_Category','RPI_Month'],
right_on = ['Type of Residential Property','Month_Timestamp']
)
'''
Estimate sale prices so that they are all comparable to December 2019 prices by using the RPI information
'''
# get RPI value for December 2019 for Dublin properties
RPI_Dublin_DEC2019 = residential_price_index.query(
"Month == '2019M12' & \
`Type of Residential Property` == 'Dublin'"
)["VALUE"].iloc[0]
# get RPI value for December 2019 for non-Dublin properties
RPI_Non_Dublin_DEC2019 = residential_price_index.query(
"Month == '2019M12' & \
`Type of Residential Property` == 'National (ex Dublin)'"
)["VALUE"].iloc[0]
# create column for Dec 2019 RPI value and append correct value based on property type (Dublin/Non-Dublin)
prices_with_rpi["RPI_DEC2019"] = np.where(
prices_with_rpi["County"] == 'Dublin',
RPI_Dublin_DEC2019,
RPI_Non_Dublin_DEC2019
)
# estimate Dec 2019 price for all dwellings based on % change of RPI from date of sale
prices_with_rpi["Est_Price_Dec2019"] = (prices_with_rpi["RPI_DEC2019"] / prices_with_rpi["VALUE"]) * prices_with_rpi["Sale_Price_ex_VAT"]
# rename columns to make them more understandable
prices_with_rpi = prices_with_rpi.rename(columns={
"Price": "Sale_Price",
"Type of Residential Property": "Location",
"VALUE": "RPI_at_Sale"
})
# drop unnecessary columns
prices_with_rpi = prices_with_rpi.drop(['Month_Timestamp'], axis=1)
Revenue have given a brief that explains they are interested in finding patterns in the property price and residential price index information.
Examples include the change in property prices over time (adjusted and unadjusted for inflation), the number of properties sold, the distribution characteristics of prices and the difference between counties and types of properties.
This is not an all-encompassing list and should be expanded upon based on what you feel the client would be interested in knowing. The purpose of this question is to showcase your ability to extract useful information relevant for a client from a data source, visualise and elaborate on those findings.
'''
Examples include the change in property prices over time (adjusted and unadjusted for inflation), the number of properties sold, the distribution characteristics of prices and the difference between counties and types of properties.
Here I load in monthly CPI (consumer price index), which is published relative to Dec 2016, and rebase it to Jan 2010.
'''
# Inflation rate in Ireland taken from https://data.cso.ie/table/CPM16
inflation_rate = pd.read_csv("source_data/CPM16.20210719T120750.csv")
# filter inflation_rate for dates between 01/01/2010 - 31/12/2019
# note, the Month strings (e.g. '2010M01') compare chronologically against these bounds, so string comparison is safe here
inflation_rate = inflation_rate.query("Month >= '2010-01-01' & Month < '2020-01-01'")
# filter inflation_rate for 'Consumer Price Index' & 'All items'
inflation_rate = inflation_rate.query("Statistic == 'Consumer Price Index' & `Detailed Sub Indices` == 'All items'")
# drop unnecessary columns
inflation_rate = inflation_rate.drop(['Statistic', 'Detailed Sub Indices', 'UNIT'], axis=1)
# rebase the index so that Jan 2010 (the first remaining row) = 100
inflation_rate["VALUE"] = (inflation_rate["VALUE"] / inflation_rate.iloc[0]["VALUE"]) * 100
# give VALUE a meaningful name
inflation_rate = inflation_rate.rename(columns={'VALUE': 'CPI'})
# Left join residential_price_index & inflation_rate on month
residential_price_index = pd.merge(
residential_price_index,
inflation_rate,
how='left',
left_on='Month',
right_on ='Month'
)
# create an inflation adjusted RPI column
residential_price_index["inflation_adjusted_VALUE"] = (residential_price_index["VALUE"] / residential_price_index["CPI"]) * 100
residential_price_index["Type - inflation adjusted"] = residential_price_index["Type of Residential Property"].astype(str) + " inflation adjusted"
# convert Month from string to datetime in inflation_rate
inflation_rate["Month_Timestamp"] = pd.to_datetime(inflation_rate["Month"], format="%YM%m")
# Left join prices_with_rpi & inflation_rate on month (RPI month)
prices_with_rpi = pd.merge(
prices_with_rpi,
inflation_rate,
how='left',
left_on='RPI_Month',
right_on ='Month_Timestamp'
)
# create an inflation adjusted sale price column
prices_with_rpi["inflation_adjusted_Sale_Price"] = (prices_with_rpi["Sale_Price"] / prices_with_rpi["CPI"]) * 100
prices_with_rpi["Location - inflation adjusted"] = prices_with_rpi["Location"].astype(str) + " inflation adjusted"
# setup matplotlib subplots
fig, ax = plt.subplots(figsize=(12, 6.75))
# plot RPI data using seaborn
g = sns.lineplot(
ax=ax,
data=residential_price_index,
x="Month_Timestamp",
y="VALUE",
hue="Type of Residential Property",
palette=sns.color_palette("dark", n_colors=2)
)
# plot inflation adjusted RPI data using seaborn
g = sns.lineplot(
ax=ax,
data=residential_price_index,
x="Month_Timestamp",
y="inflation_adjusted_VALUE",
hue="Type - inflation adjusted",
palette=sns.color_palette("pastel", n_colors=2)
)
# set plot properties
_ = plt.xlabel("Month")
_ = plt.ylabel("RPI %")
_ = plt.title("Monthly Residential Property Price Index - Relative to 2015 Average")
_ = ax.legend(title="Legend")
As you can see from figure 1, the inflation adjusted RPI for National (ex Dublin) begins and ends the 10 year period at a very similar level, with a substantial dip ending in 2013. This means that the real cost of housing outside Dublin was likely much the same at the end of 2019 as it was in 2010. Conversely, the inflation adjusted RPI in Dublin at the end of 2019 is substantially higher than its 2010 value, meaning the real cost of housing there has likely risen substantially over the same period. This graph points to a disparity in the changing cost of living in Dublin vs the rest of the nation over the last 10 years.
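As a quick numerical check of this reading (a sketch against the residential_price_index frame built above), the inflation adjusted RPI at the endpoints of the period can be printed directly:
# inflation adjusted RPI at the start and end of the period, for both property types
endpoints = residential_price_index.query("Month in ('2010M01', '2019M12')")
print(endpoints[["Month", "Type of Residential Property", "inflation_adjusted_VALUE"]])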
# setup matplotlib subplots
fig, ax = plt.subplots(1, 2, figsize=(24, 6.75))
# group data by month and location & calculate quantity of sales
property_sold_by_month = prices_with_rpi.groupby(['RPI_Month', 'Location']).size().reset_index(name='counts')
g = sns.lineplot(
ax=ax[0],
data=property_sold_by_month,
x="RPI_Month",
y="counts",
hue="Location",
palette=sns.color_palette("deep", n_colors=2)
)
# set plot properties
_ = plt.sca(ax[0])
_ = plt.xlabel("Month")
_ = plt.ylabel("Qty")
_ = ax[0].legend(title="Legend")
_ = plt.title("Monthly Quantity of Residential Properties Sold")
# group data by month and (new/second hand) & calculate quantity of sales
property_sold_by_month = prices_with_rpi.groupby(['RPI_Month', 'New']).size().reset_index(name='counts')
g = sns.lineplot(
ax=ax[1],
data=property_sold_by_month,
x="RPI_Month",
y="counts",
hue="New",
palette=sns.color_palette("deep", n_colors=2)
)
# set plot properties
_ = plt.sca(ax[1])
_ = plt.xlabel("Month")
_ = plt.ylabel("Qty")
_ = ax[1].legend(title="Legend", labels=["Second Hand", "New"])
_ = plt.title("Monthly Quantity of Residential Properties Sold")
The monthly quantity of properties sold follows a seasonal pattern. January generally has the lowest volume for the year; the volume then builds throughout the year, with December being the busiest month for the market. This pattern is present for new and second hand properties, inside and outside of Dublin. There was a gradual rise in the number of properties sold in the first half of the decade, and since then the volume has been relatively stable. The quantity of new houses sold has continued to rise in the last five years, but these remain a smaller proportion of the overall market.
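As a rough check on the seasonality claim (a sketch using the prices_with_rpi frame built above):
# mean number of sales per calendar month across 2010-2019;
# January should show the lowest average count and December the highest
sales_per_month = prices_with_rpi.groupby("RPI_Month").size()
print(sales_per_month.groupby(sales_per_month.index.month).mean())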
# setup matplotlib subplots
fig, ax = plt.subplots(1, 2, figsize=(24, 6.75))
# group data by month and location & calculate mean sale price
mean_price_sold_by_month = prices_with_rpi.groupby(
['RPI_Month', 'Location'])["Sale_Price"].mean().reset_index(name='mean_sale_price'
)
# group data by month and location & calculate inflation adjusted mean sale price
inflation_adjusted_mean_price_sold_by_month = prices_with_rpi.groupby(
['RPI_Month', 'Location - inflation adjusted'])["inflation_adjusted_Sale_Price"].mean().reset_index(name='inflation_adjusted_mean_sale_price'
)
# plot mean_sale_price using seaborn
g = sns.lineplot(
ax=ax[0],
data=mean_price_sold_by_month,
x="RPI_Month",
y="mean_sale_price",
hue="Location",
palette=sns.color_palette("dark", n_colors=2),
legend="full"
)
# plot inflation_adjusted_mean_sale_price using seaborn
g = sns.lineplot(
ax=ax[0],
data=inflation_adjusted_mean_price_sold_by_month,
x="RPI_Month",
y="inflation_adjusted_mean_sale_price",
hue="Location - inflation adjusted",
palette=sns.color_palette("pastel", n_colors=2),
legend="full"
)
# set plot properties
_ = plt.sca(ax[0])
_ = plt.xlabel("Month")
_ = plt.ylabel("mean sale price €")
_ = plt.legend(title="Legend")
_ = plt.title("Monthly Mean Sale Price")
# group data by month and location & calculate median sale price
median_price_sold_by_month = prices_with_rpi.groupby(
['RPI_Month', 'Location'])["Sale_Price"].median().reset_index(name='median_sale_price'
)
# group data by month and location & calculate inflation adjusted median sale price
inflation_adjusted_median_price_sold_by_month = prices_with_rpi.groupby(
['RPI_Month', 'Location - inflation adjusted'])["inflation_adjusted_Sale_Price"].median().reset_index(name='inflation_adjusted_median_sale_price'
)
# plot median_sale_price using seaborn
g = sns.lineplot(
ax=ax[1],
data=median_price_sold_by_month,
x="RPI_Month",
y="median_sale_price",
hue="Location",
palette=sns.color_palette("dark", n_colors=2),
legend="full"
)
# plot inflation_adjusted_median_sale_price using seaborn
g = sns.lineplot(
ax=ax[1],
data=inflation_adjusted_median_price_sold_by_month,
x="RPI_Month",
y="inflation_adjusted_median_sale_price",
hue="Location - inflation adjusted",
palette=sns.color_palette("pastel", n_colors=2),
legend="full"
)
# set plot properties
_ = plt.sca(ax[1])
_ = plt.xlabel("Month")
_ = plt.ylabel("median sale price €")
_ = plt.title("Monthly Median Sale Price")
_ = plt.legend(title="Legend")
Figure 3 clearly depicts the price gap, mean and median, between housing inside and outside of Dublin. At the beginning of the 10 year period the gap was roughly €50k between the median sale prices and roughly €100k between the mean sale prices. The inflation adjusted mean and median prices for residential property outside of Dublin did not rise significantly during the 10 year period; inside Dublin, however, they rose significantly. This leaves the inflation adjusted median price gap at the end of the period at roughly €75k and the inflation adjusted mean price gap at roughly €250k. This is consistent with the trends observed in figure 1 and once again points at the disproportionately high rise in the real cost of living in Dublin.
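The quoted gaps can be checked directly (a sketch; the exact figures will depend on the data download used):
# yearly median inflation adjusted sale price by location, and the Dublin premium at the endpoints
yearly_medians = prices_with_rpi.groupby(
    [prices_with_rpi["RPI_Month"].dt.year, "Location"]
)["inflation_adjusted_Sale_Price"].median().unstack()
print((yearly_medians["Dublin"] - yearly_medians["National (ex Dublin)"]).loc[[2010, 2019]])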
# setup matplotlib subplots
fig, ax = plt.subplots(1, 2, figsize=(24, 6.75))
# group data by month and (new/second hand) & calculate mean sale price
mean_price_sold_by_month = prices_with_rpi.groupby(
['RPI_Month', 'New'])["Sale_Price"].mean().reset_index(name='mean_sale_price'
)
# plot mean_sale_price using seaborn
g = sns.lineplot(
ax=ax[0],
data=mean_price_sold_by_month,
x="RPI_Month",
y="mean_sale_price",
hue="New",
palette=sns.color_palette("deep", n_colors=2),
legend="full"
)
# set plot properties
_ = plt.sca(ax[0])
_ = plt.xlabel("Month")
_ = plt.ylabel("mean sale price €")
_ = plt.title("Monthly Mean Sale Price")
_ = ax[0].legend(title="Legend", labels=["Second Hand", "New"])
# group data by month and (new/second hand) & calculate median sale price
median_price_sold_by_month = prices_with_rpi.groupby(
['RPI_Month', 'New'])["Sale_Price"].median().reset_index(name='median_sale_price'
)
# plot median_sale_price using seaborn
g = sns.lineplot(
ax=ax[1],
data=median_price_sold_by_month,
x="RPI_Month",
y="median_sale_price",
hue="New",
palette=sns.color_palette("deep", n_colors=2),
legend="full"
)
# set plot properties
_ = plt.sca(ax[1])
_ = plt.xlabel("Month")
_ = plt.ylabel("median sale price €")
_ = plt.title("Monthly Median Sale Price")
_ = ax[1].legend(title="Legend", labels=["Second Hand", "New"])
Over the 10 year period the mean and median cost of new and second hand residential property have remained relatively well matched. It is evident, however, that the cost of new housing has risen to a greater extent than second hand housing. In both the mean and the median plots it can be seen that second hand properties were more expensive in 2010 than new properties; this relationship had flipped by the end of 2019. This may point to a lack of affordable new builds on the market. Volatility in the data for new builds is quite high as they represent a smaller proportion of the market, so there is less data available to smooth out outliers.
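The crossover between new and second hand prices can also be checked numerically (a sketch using the prices_with_rpi frame built above):
# median sale price for second hand (False) vs new (True) properties at the start and end of the decade
medians_by_new = prices_with_rpi.groupby(
    [prices_with_rpi["RPI_Month"].dt.year, "New"]
)["Sale_Price"].median().unstack()
print(medians_by_new.loc[[2010, 2019]])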
# setup matplotlib subplots
fig, ax = plt.subplots(1, 2, figsize=(24, 6.75))
# group data by month and county & calculate mean sale price
mean_price_sold_by_month = prices_with_rpi.groupby(
['RPI_Month', 'County'])["Sale_Price"].mean().reset_index(name='mean_sale_price'
)
# plot mean_sale_price using seaborn
g = sns.lineplot(
ax=ax[0],
data=mean_price_sold_by_month,
x="RPI_Month",
y="mean_sale_price",
hue="County",
palette=sns.color_palette("deep", n_colors=5),
legend="full"
)
# set plot properties
_ = plt.sca(ax[0])
_ = plt.xlabel("Month")
_ = plt.ylabel("mean sale price €")
_ = plt.title("Monthly Mean Sale Price")
_ = ax[0].legend(title="Legend")
# group data by month and county & calculate median sale price
median_price_sold_by_month = prices_with_rpi.groupby(
['RPI_Month', 'County'])["Sale_Price"].median().reset_index(name='median_sale_price'
)
# plot median_sale_price using seaborn
g = sns.lineplot(
ax=ax[1],
data=median_price_sold_by_month,
x="RPI_Month",
y="median_sale_price",
hue="County",
palette=sns.color_palette("deep", n_colors=5),
legend="full"
)
# set plot properties
_ = plt.sca(ax[1])
_ = plt.xlabel("Month")
_ = plt.ylabel("median sale price €")
_ = plt.title("Monthly Median Sale Price")
_ = ax[1].legend(title="Legend")
Once again, in figure 5 the price gap between Dublin and the other counties included in this study is evident. Volatility is quite high in these plots because breaking the data down by county leaves less data in each group. Meath has the highest prices of the non-Dublin counties, which is likely a result of its proximity to Dublin.
# setup matplotlib subplots
fig, ax = plt.subplots(1, 2, figsize=(24, 6.75))
# plot histograms of the estimated 2019 price for all properties split by location (Dublin/non-Dublin)
_ = prices_with_rpi.query("Est_Price_Dec2019 <= 2000000")['Est_Price_Dec2019'].hist(
ax=[ax[0], ax[1]],
by=prices_with_rpi['Location'],
bins=50
)
# set plot properties
for iax in ax:
    _ = plt.sca(iax)
    _ = plt.xlabel("Price €")
    _ = plt.ylabel("Quantity Sold")
This data is not perfectly normally distributed: the mass of each distribution is concentrated towards lower prices and there is a long right tail.
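This can be quantified with pandas' sample skewness estimate (a quick sketch; values well above zero indicate a right-skewed distribution):
# sample skewness of estimated Dec 2019 prices by location
print(prices_with_rpi.groupby("Location")["Est_Price_Dec2019"].skew())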
# setup matplotlib subplots
fig, ax = plt.subplots(2, 3, figsize=(24, 6.75))
# plot histograms of the estimated 2019 price for all properties split by county
_ = prices_with_rpi.query("Est_Price_Dec2019 <= 2000000")['Est_Price_Dec2019'].hist(
ax=[ax[0, 0], ax[0, 1], ax[0, 2], ax[1, 0], ax[1, 1]],
by=prices_with_rpi['County'],
bins=50
)
# set plot properties
for iax in ax:
    for jax in iax:
        _ = plt.sca(jax)
        _ = plt.xlabel("Price €")
        _ = plt.ylabel("Quantity Sold")
As before, this data is not perfectly normally distributed: each county's distribution is concentrated towards lower prices with a long right tail.
Using an unsupervised approach, implement a method to classify whether a property sale should be considered an outlier or not. In the context of the county and type of property, the Revenue is interested in 2 types of outliers:

- The Revenue are interested in creating a backlog of property sales to investigate for tax avoidance.
- The Revenue are interested in identifying sales which do not represent a sale of a single residential property. The client believes there may be many instances of a single sale which are for several properties, such as apartment blocks.
Therefore, a deliverable for the client is to provide a list of properties which were sold for abnormally low and high values i.e. outliers.
The approach chosen should be informed from information you have observed in the previous step.
Prior to an implementation, please provide a detailed explanation of the reason the selected approach was chosen and provide a summary of the findings regarding the list of outliers identified.
The z-score or standard score of an observation is a metric that indicates how many standard deviations a data point is from the sample's mean, assuming a Gaussian distribution. This makes the z-score a parametric method. Very frequently data points are not well described by a Gaussian distribution; this problem can often be solved by applying a transformation to the data (here, a log transform).
After making the appropriate transformations to the selected feature space of the dataset, the z-score of any data point can be calculated with the following expression:
$$ z = \frac{x - \mu}{\sigma} $$
Once the z value has been calculated, a threshold for acceptable values can be determined. A z value greater than 3 corresponds to roughly the top 0.1% of a standard normal distribution.
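The tail probability behind that threshold can be verified numerically (assuming scipy is available; it is not otherwise used in this notebook):
from scipy.stats import norm
# upper-tail probability beyond z = 3 for a standard normal distribution (~0.13%)
print(norm.sf(3))
# two-sided: expected fraction of observations with |z| >= 3 (~0.27%)
print(2 * norm.sf(3))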
As seen in the above histograms, the data is not perfectly normally distributed: the mass leans towards zero and there is a long tail. In order to transform this data closer to a normal distribution we can take the natural log of the values.
# calculate the log of the estimated 2019 price
prices_with_rpi["log(Est_Price_Dec2019)"] = np.log(prices_with_rpi["Est_Price_Dec2019"])
# setup matplotlib subplots
fig, ax = plt.subplots(1, 2, figsize=(24, 6.75))
# plot histograms of the log of the estimated 2019 price for all properties split by location (Dublin/non-Dublin)
_ = prices_with_rpi.query("Est_Price_Dec2019 <= 2000000")['log(Est_Price_Dec2019)'].hist(
ax=[ax[0], ax[1]],
by=prices_with_rpi['Location'],
bins=50
)
# set plot properties
for iax in ax:
    _ = plt.sca(iax)
    _ = plt.xlabel("log(Price €)")
    _ = plt.ylabel("Quantity Sold")
As we can see, this transformation has produced a much better approximation of a normal distribution for our data.
# setup matplotlib subplots
fig, ax = plt.subplots(2, 3, figsize=(24, 6.75))
# plot histograms of the log of the estimated 2019 price for all properties split by county
_ = prices_with_rpi.query("Est_Price_Dec2019 <= 2000000")['log(Est_Price_Dec2019)'].hist(
ax=[ax[0, 0], ax[0, 1], ax[0, 2], ax[1, 0], ax[1, 1]],
by=prices_with_rpi['County'],
bins=50
)
# set plot properties
for iax in ax:
    for jax in iax:
        _ = plt.sca(jax)
        _ = plt.xlabel("log(Price €)")
        _ = plt.ylabel("Quantity Sold")
As we can see, this transformation has produced a much better approximation of a normal distribution across all counties.
'''
Here the z score is calculated separately for Dublin and ex Dublin properties. County-wise z score calculation is not necessary as the means of the histograms for all non-Dublin counties are relatively close.
'''
# create data frames to separate Dublin and ex Dublin properties
prices_with_rpi_dublin = prices_with_rpi.query("County == 'Dublin'").reset_index()
prices_with_rpi_exdublin = prices_with_rpi.query("County != 'Dublin'").reset_index()
# calculate z score for Dublin properties
prices_with_rpi_dublin["zscore"] = (
(prices_with_rpi_dublin["log(Est_Price_Dec2019)"] - prices_with_rpi_dublin["log(Est_Price_Dec2019)"].mean())
/ prices_with_rpi_dublin["log(Est_Price_Dec2019)"].std(ddof=0)
)
# calculate z score for ex Dublin properties
prices_with_rpi_exdublin["zscore"] = (
(prices_with_rpi_exdublin["log(Est_Price_Dec2019)"] - prices_with_rpi_exdublin["log(Est_Price_Dec2019)"].mean())
/ prices_with_rpi_exdublin["log(Est_Price_Dec2019)"].std(ddof=0)
)
# select overpriced properties with a z score >= 3
overpriced_dublin = prices_with_rpi_dublin.query("zscore >= 3").reset_index()
overpriced_exdublin = prices_with_rpi_exdublin.query("zscore >= 3").reset_index()
overpriced = pd.concat([overpriced_dublin, overpriced_exdublin])
# select underpriced properties with a z score <= -3
underpriced_dublin = prices_with_rpi_dublin.query("zscore <= -3").reset_index()
underpriced_exdublin = prices_with_rpi_exdublin.query("zscore <= -3").reset_index()
underpriced = pd.concat([underpriced_dublin, underpriced_exdublin])
# define columns to save
columns_to_save = [
"Date",
"Address",
"County",
"Sale_Price",
"VAT Exclusive",
"Not Full Market Price",
"Description of Property"
]
# keep only the required columns
overpriced = overpriced[columns_to_save]
underpriced = underpriced[columns_to_save]
# save lists to csv
overpriced.to_csv("overpriced.csv", index=False)
underpriced.to_csv("underpriced.csv", index=False)
The final client deliverable is an evaluation report which outlines the suitability of using a model that can forecast monthly stamp duty.
For the evaluation report, please provide the following sections:
The Revenue have provided the following stamp duty formula.
$$ \text{Stamp Duty Due} = \text{Stamp Duty Rate} \times (\text{Sale Price} - (\text{Sale Price} \times \text{VAT})) $$

where Stamp Duty Rate is assumed to be 1% for all sales and VAT is 13.5%.
Limit the evaluation to the data output from part 2 in the case study.
In order to properly explore this problem it will be useful to plot stamp duty over the 10 year period.
# set stamp duty rate
stamp_duty_rate = 0.01
# calculate stamp duty on all property sales
prices_with_rpi["Stamp_Duty"] = prices_with_rpi["Sale_Price_ex_VAT"] * stamp_duty_rate
# create year column in order to group by year
prices_with_rpi["Year"] = prices_with_rpi["RPI_Month"].apply(lambda x : x.replace(month=1))
# calculate monthly stamp duty
monthly_stamp_duty = prices_with_rpi.groupby(
['RPI_Month'])["Stamp_Duty"].sum().reset_index(name='Stamp_Duty_Total'
)
# calculate yearly stamp duty
yearly_stamp_duty = prices_with_rpi.groupby(
['Year'])["Stamp_Duty"].sum().reset_index(name='Stamp_Duty_Total'
)
# setup matplotlib subplots
fig, ax = plt.subplots(1, 2, figsize=(24, 6.75))
# plot monthly stamp duty using seaborn
g = sns.lineplot(
ax=ax[0],
data=monthly_stamp_duty,
x="RPI_Month",
y="Stamp_Duty_Total",
palette=sns.color_palette("deep", n_colors=1)
)
# set plot properties
_ = plt.sca(ax[0])
_ = plt.xlabel("Month")
_ = plt.ylabel("Stamp Duty Total €")
_ = plt.title("Monthly Stamp Duty")
# plot yearly stamp duty using seaborn
g = sns.lineplot(
ax=ax[1],
data=yearly_stamp_duty,
x="Year",
y="Stamp_Duty_Total",
palette=sns.color_palette("deep", n_colors=1)
)
# set plot properties
_ = plt.sca(ax[1])
_ = plt.xlabel("Year")
_ = plt.ylabel("Stamp Duty Total €")
_ = plt.title("Yearly Stamp Duty")
As seen earlier in figure 2, the quantity of houses sold follows a seasonal pattern with a slight upward trend. The mean sale price has also been shown to have a slight upward trend, so the yearly trend of stamp duty is upwards too. This yearly trend depends on many variables; it is likely to always trend up over long periods of time due to inflation, but accurately predicting yearly fluctuations depends on market conditions. A sophisticated prediction of macroeconomic trends is beyond the scope of this assignment, so we will assume that this upward trend continues and accept this as a limitation of the study. The seasonal pattern, however, depends on the calendar months, which are constant, so it can reasonably be assumed that this consumer behaviour will continue.
These trends point to a seasonal naive model as a baseline. A seasonal naive model predicts the last value of the same season (same month last year) when forecasting. The upward yearly trend can be superimposed onto this to give a final prediction.
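In formula terms, for monthly data with a seasonal period of 12 the two candidate models are:
$$ \hat{y}_t = y_{t-12} \quad \text{(seasonal naive)} \qquad \hat{y}_t = y_{t-12} \times (1 + \Delta) \quad \text{(seasonal naive with trend)} $$
where $\Delta$ is the predicted yearly % change in total stamp duty taken from the fitted trend line.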
# fit a 2nd order polynomial to predict the yearly trend line
# for each year the fit uses all previous years, with a minimum of 4 years of history
# this means the first 4 years are not predicted
# create a polynomial fit for every year from 2014 - 2019 inclusive
poly_lines = [np.polyfit(yearly_stamp_duty.index[:i], yearly_stamp_duty["Stamp_Duty_Total"][:i], 2) for i in range(4, len(yearly_stamp_duty.index))]
# reuse the fit for 2014 for the years 2010 - 2014 (these years are fitted rather than predicted)
ps = [np.poly1d(poly_lines[0])]*4 + [np.poly1d(poly_line) for poly_line in poly_lines]
# calculate predicted yearly trend for each year
yearly_trend = [p(i) for i, p in enumerate(ps)]
yearly_stamp_duty["yearly_trend"] = yearly_trend
# calculate yearly % change based on predicted values
yearly_stamp_duty["pct_change"] = yearly_stamp_duty["yearly_trend"].pct_change().fillna(0)
# setup matplotlib subplots
fig, ax = plt.subplots(figsize=(24, 6.75))
# plot yearly stamp duty using seaborn
g = sns.lineplot(
ax=ax,
data=yearly_stamp_duty,
x="Year",
y="Stamp_Duty_Total",
palette=sns.color_palette("deep", n_colors=1)
)
# set plot properties
_ = plt.sca(ax)
_ = plt.xlabel("Year")
_ = plt.ylabel("Stamp Duty Total €")
_ = plt.title("Yearly Stamp Duty")
# plot yearly stamp duty using seaborn
g = sns.lineplot(
ax=ax,
data=yearly_stamp_duty,
x="Year",
y="yearly_trend",
color=sns.color_palette("deep", n_colors=2)[1]
)
# set plot properties
_ = plt.sca(ax)
_ = plt.xlabel("Year")
_ = plt.ylabel("Stamp Duty Total €")
_ = plt.title("Yearly Stamp Duty")
_ = ax.legend(title="Legend", labels=["Stamp Duty", "Prediction"])
The polynomial prediction is fitted to the data for the first 4 years; after this, the line is a prediction of the stamp duty for each year based on the previous years available. This works reasonably well for this task.
# create year column in order to match by year
monthly_stamp_duty["Year"] = monthly_stamp_duty["RPI_Month"].apply(lambda x : x.replace(month=1))
# Left join monthly_stamp_duty & yearly_stamp_duty on year to get % change
monthly_stamp_duty = pd.merge(
monthly_stamp_duty,
yearly_stamp_duty[["Year", "pct_change"]],
how='left',
left_on='Year',
right_on ='Year'
)
# calculate seasonal naive value
monthly_stamp_duty["naive_prediction"] = monthly_stamp_duty["Stamp_Duty_Total"].shift(12)
# calculate seasonal naive value with yearly trend % change multiplier
monthly_stamp_duty["naive_prediction_trend"] = (monthly_stamp_duty["Stamp_Duty_Total"] * (1 + monthly_stamp_duty["pct_change"])).shift(12)
# setup matplotlib subplots
fig, ax = plt.subplots(1, 2, figsize=(24, 6.75))
# plot monthly stamp duty using seaborn
g = sns.lineplot(
ax=ax[0],
data=monthly_stamp_duty[60:],
x="RPI_Month",
y="Stamp_Duty_Total",
palette=sns.color_palette("deep", n_colors=1)
)
# plot naive prediction using seaborn
g = sns.lineplot(
ax=ax[0],
data=monthly_stamp_duty[60:],
x="RPI_Month",
y="naive_prediction",
color=sns.color_palette("deep", n_colors=2)[1]
)
# set plot properties
_ = plt.sca(ax[0])
_ = plt.xlabel("Month")
_ = plt.ylabel("Stamp Duty Total €")
_ = plt.title("Monthly Stamp Duty Prediction Seasonal Naive Model")
_ = ax[0].legend(title="Legend", labels=["Stamp Duty", "Prediction"])
# plot monthly stamp duty using seaborn
g = sns.lineplot(
ax=ax[1],
data=monthly_stamp_duty[60:],
x="RPI_Month",
y="Stamp_Duty_Total",
palette=sns.color_palette("deep", n_colors=1)
)
# plot naive prediction using seaborn
g = sns.lineplot(
ax=ax[1],
data=monthly_stamp_duty[60:],
x="RPI_Month",
y="naive_prediction_trend",
color=sns.color_palette("deep", n_colors=2)[1]
)
# set plot properties
_ = plt.sca(ax[1])
_ = plt.xlabel("Month")
_ = plt.ylabel("Stamp Duty Total €")
_ = plt.title("Monthly Stamp Duty Prediction Seasonal Naive Model with Trend")
_ = ax[1].legend(title="Legend", labels=["Stamp Duty", "Prediction"])
This figure shows a prediction for the last 6 years of the decade analysed (as a minimum of 4 years was used to produce the trend line for prediction). The seasonal naive model looks to perform well from a visual standpoint; evaluation metrics are provided below. It is hard to tell visually whether adding the yearly trend % change multiplier helped the prediction, so this is also evaluated below.
# calculate the squared error for the seasonal naive model
monthly_stamp_duty["SN_squared_error"] = (monthly_stamp_duty["Stamp_Duty_Total"] - monthly_stamp_duty["naive_prediction"]) ** 2
# calculate the squared error for the seasonal naive model with trend
monthly_stamp_duty["SNT_squared_error"] = (monthly_stamp_duty["Stamp_Duty_Total"] - monthly_stamp_duty["naive_prediction_trend"]) ** 2
# calculate the root mean squared error for the seasonal naive model
RMSE_SN = np.sqrt(monthly_stamp_duty["SN_squared_error"].mean())
# calculate the root mean squared error for the seasonal naive model with trend
RMSE_SNT = np.sqrt(monthly_stamp_duty["SNT_squared_error"].mean())
# calculate the mean absolute percentage error for the seasonal naive model
mean_abs_error_pct_sn = (
abs(monthly_stamp_duty["Stamp_Duty_Total"] - monthly_stamp_duty["naive_prediction"])/monthly_stamp_duty["Stamp_Duty_Total"]
).mean()
# calculate the mean absolute percentage error for the seasonal naive model with trend
mean_abs_error_pct_snt = (
abs(monthly_stamp_duty["Stamp_Duty_Total"] - monthly_stamp_duty["naive_prediction_trend"])/monthly_stamp_duty["Stamp_Duty_Total"]
).mean()
print("RMSE for seasonal naive model:", round(RMSE_SN,2))
print("Mean abs error % for seasonal naive model:", round(mean_abs_error_pct_snt * 100,2), "%")
print("RMSE for seasonal naive model with trend:", round(RMSE_SNT,2))
print("Mean abs error % for seasonal naive model with trend:", round(mean_abs_error_pct_sn * 100,2), "%")
RMSE for seasonal naive model: 1463917.31
Mean abs error % for seasonal naive model: 20.72 %
RMSE for seasonal naive model with trend: 1642630.98
Mean abs error % for seasonal naive model with trend: 21.83 %
The seasonal naive model does a good job of predicting future stamp duty and can be used with an expectation that it will perform with a mean absolute percentage error of roughly 21%.
The model performed better without the trend prediction added on both metrics, root mean squared error and mean absolute percentage error. This highlights that the trend prediction is not adequate and needs to be improved.
More data could be included to produce better predictions for the yearly trend in property sales quantity and prices; such a prediction requires a broader look at macroeconomic trends.