Mecklenburg County, NC is home to the city of Charlotte and more than a million residents, which makes for a busy real estate market. In this project, I will walk through my process for building a simple model that predicts home sale prices.
This project is based on the R-based mid-term project for MUSA 508 at the University of Pennsylvania. I used the data from this class and developed this version in Python.
# Set up
import pandas as pd
import geopandas as gpd
import numpy as np
from matplotlib import pyplot as plt
import seaborn
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore')
# For model building
# Linear models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
# Model selection
from sklearn.model_selection import train_test_split
# Pipelines
from sklearn.pipeline import make_pipeline
# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
# For statistical summaries
import statsmodels.api as sm
import statsmodels.formula.api as smf
# For Moran's I and LISA statistics
from libpysal.weights.contiguity import Queen
from esda.moran import Moran
from splot import esda as esdaplot
from pysal.lib import weights
from esda.moran import Moran_Local
# read in dataset
raw_data = gpd.read_file("data/studentData.geojson")
Our dataset includes more than 43,000 home sales in Mecklenburg between 2020 and 2022. First, I'll spend time cleaning this dataset of missing and irrelevant information.
# Review dataset
raw_data.head()
raw_data.columns
# Pare down columns - select columns to keep and convert to a geographic object
cols_wanted = ["municipali", "dateofsale", "price", "yearbuilt", "heatedarea", "cdebuildin", "descbuildi", "storyheigh", "aheatingty", "heatedfuel", "actype", "extwall", "foundation", "numfirepla", "bldggrade", "fullbaths", "halfbaths", "bedrooms", "units", "shape_Leng", "shape_Area", "sale_year", "toPredict", "geometry"]
cleaning_data = raw_data[cols_wanted]
cleaning_data = gpd.GeoDataFrame(cleaning_data).to_crs("EPSG:4326")
# Remove MUSA challenge houses
#cleaning_data['toPredict'].unique() # see what the options are
cleaning_data = cleaning_data.loc[cleaning_data['toPredict'] == 'MODELLING']
# View pared down dataset
cleaning_data.head()
# Review the datatypes for each column - this can help guide some of our cleaning.
cleaning_data.dtypes
municipali         object
dateofsale         object
price             float64
yearbuilt           int64
heatedarea        float64
cdebuildin         object
descbuildi         object
storyheigh         object
aheatingty         object
heatedfuel         object
actype             object
extwall            object
foundation         object
numfirepla        float64
bldggrade          object
fullbaths           int64
halfbaths           int64
bedrooms          float64
units               int64
shape_Leng        float64
shape_Area        float64
sale_year         float64
toPredict          object
geometry         geometry
dtype: object
Based on the dataset's metadata, we can see what each of the codes means for a few of the more curious columns. From this, I've decided to limit the data to single-family residential homes (cdebuildin = 01).
I'll also convert the date of sale into a real date.
# Clean building codes - metadata here: http://maps.co.mecklenburg.nc.us/opendata/metadata/tax_parcels_with_cama_data.xml
cleaning_data["cdebuildin"].unique()
#How many different types of homes are in each category?
cleaning_data.groupby('cdebuildin').size()
# The vast majority (45782) of homes are single family homes. Will limit our analysis to just Single Family Residential homes (cdebuildin = 01)
cleaning_data = cleaning_data.loc[cleaning_data['cdebuildin'] == '01']
#How many different types of homes are in each category within descbuildi?
cleaning_data.groupby('descbuildi').size()
# All are RES - so we can get rid of this feature (later)
# Convert dateofsale to a date
cleaning_data['dateofsale'] = pd.to_datetime(cleaning_data['dateofsale'])
cleaning_data.head()
municipali | dateofsale | price | yearbuilt | heatedarea | cdebuildin | descbuildi | storyheigh | aheatingty | heatedfuel | ... | bldggrade | fullbaths | halfbaths | bedrooms | units | shape_Leng | shape_Area | sale_year | toPredict | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | HUNTERSVILLE | 2021-07-01 | 375000.0 | 1976 | 1596.0 | 01 | RES | 1 STORY | AIR-DUCTED | GAS | ... | AVERAGE | 2 | 0 | 3.0 | 1 | 856.759354 | 36157.191937 | 2021.0 | MODELLING | POINT (-80.93759 35.43227) |
1 | HUNTERSVILLE | 2021-08-13 | 675000.0 | 2010 | 3244.0 | 01 | RES | 2.0 STORY | AIR-DUCTED | GAS | ... | VERY GOOD | 3 | 1 | 4.0 | 1 | 736.049633 | 26070.156589 | 2021.0 | MODELLING | POINT (-80.93604 35.43186) |
2 | HUNTERSVILLE | 2021-10-25 | 3793500.0 | 2008 | 5843.0 | 01 | RES | 2.0 STORY | AIR-DUCTED | GAS | ... | EXCELLENT | 3 | 1 | 4.0 | 1 | 1133.483555 | 41517.942335 | 2021.0 | MODELLING | POINT (-80.93491 35.43386) |
3 | HUNTERSVILLE | 2020-04-29 | 1125000.0 | 1991 | 4272.0 | 01 | RES | 1 STORY | AIR-DUCTED | GAS | ... | VERY GOOD | 4 | 0 | 4.0 | 1 | 908.459599 | 38364.218018 | 2020.0 | MODELLING | POINT (-80.93474 35.43432) |
4 | HUNTERSVILLE | 2020-02-18 | 915000.0 | 2014 | 3542.0 | 01 | RES | 2.0 STORY | AIR-DUCTED | GAS | ... | VERY GOOD | 4 | 1 | 4.0 | 1 | 1074.433466 | 58523.332015 | 2020.0 | MODELLING | POINT (-80.93803 35.43247) |
5 rows × 24 columns
So that we can use story heights as a numerical variable, I'll convert the string of story heights into numbers. Since there are relatively few homes with non-numerical descriptions of the building's height (e.g. "Split Level") compared to the size of the entire dataset, I'll remove these instances from the dataset.
# will need to convert storyheigh to a numeric variable
cleaning_data.groupby('storyheigh').size()
# remove instances without specific storey height
levels_dont_want = ["BI-LEVEL", "CAPE COD", "RANCH W/BSMT", "SPLIT LEVEL"]
levels_want = ["1 STORY", "1.5 STORY", "2.0 STORY", "2.5 STORY", "3.0 STORY", ">=4.0 STORY"]
cleaning_data = cleaning_data.loc[cleaning_data["storyheigh"].isin(levels_want)]
# Convert story levels to numeric
levels_numeric = {"1 STORY":1, "1.5 STORY":1.5, "2.0 STORY":2, "2.5 STORY":2.5, "3.0 STORY":3, ">=4.0 STORY":4}
cleaning_data["storyheigh"] = cleaning_data["storyheigh"].replace(levels_numeric)
#cleaning_data.groupby("storyheigh").size()
cleaning_data.head()
municipali | dateofsale | price | yearbuilt | heatedarea | cdebuildin | descbuildi | storyheigh | aheatingty | heatedfuel | ... | bldggrade | fullbaths | halfbaths | bedrooms | units | shape_Leng | shape_Area | sale_year | toPredict | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | HUNTERSVILLE | 2021-07-01 | 375000.0 | 1976 | 1596.0 | 01 | RES | 1.0 | AIR-DUCTED | GAS | ... | AVERAGE | 2 | 0 | 3.0 | 1 | 856.759354 | 36157.191937 | 2021.0 | MODELLING | POINT (-80.93759 35.43227) |
1 | HUNTERSVILLE | 2021-08-13 | 675000.0 | 2010 | 3244.0 | 01 | RES | 2.0 | AIR-DUCTED | GAS | ... | VERY GOOD | 3 | 1 | 4.0 | 1 | 736.049633 | 26070.156589 | 2021.0 | MODELLING | POINT (-80.93604 35.43186) |
2 | HUNTERSVILLE | 2021-10-25 | 3793500.0 | 2008 | 5843.0 | 01 | RES | 2.0 | AIR-DUCTED | GAS | ... | EXCELLENT | 3 | 1 | 4.0 | 1 | 1133.483555 | 41517.942335 | 2021.0 | MODELLING | POINT (-80.93491 35.43386) |
3 | HUNTERSVILLE | 2020-04-29 | 1125000.0 | 1991 | 4272.0 | 01 | RES | 1.0 | AIR-DUCTED | GAS | ... | VERY GOOD | 4 | 0 | 4.0 | 1 | 908.459599 | 38364.218018 | 2020.0 | MODELLING | POINT (-80.93474 35.43432) |
4 | HUNTERSVILLE | 2020-02-18 | 915000.0 | 2014 | 3542.0 | 01 | RES | 2.0 | AIR-DUCTED | GAS | ... | VERY GOOD | 4 | 1 | 4.0 | 1 | 1074.433466 | 58523.332015 | 2020.0 | MODELLING | POINT (-80.93803 35.43247) |
5 rows × 24 columns
There were only 444 homes that sold for more than $2 million. Since the market for high-priced homes is very different from the general market, I'm going to remove homes that sold for more than $2 million from the dataset.
# How much variation is there in price?
pd.options.display.float_format = '{:20,.2f}'.format # turn off scientific notation
cleaning_data["price"].describe()
len(cleaning_data["price"].loc[cleaning_data["price"] > 2e6])
# just 444 homes sold for over $2 million. I'm going to limit my prediction to homes that sold for $2 million or less.
cleaning_data = cleaning_data.loc[cleaning_data["price"] <= 2e6]
# # # Year Built # # #
# Yearbuilt - any cleaning needed?
cleaning_data.groupby("yearbuilt").size()
# Yes - remove the one home with a yearbuilt as 0. Otherwise, the earliest home is from 1897, which is reasonable.
cleaning_data = cleaning_data.loc[cleaning_data["yearbuilt"] != 0]
# # # Heated Area # # #
# Heatedarea - any cleaning needed?
cleaning_data["heatedarea"].describe()
# Yes - there is at least one home with 0 heated area.
len(cleaning_data.loc[cleaning_data["heatedarea"] < 500])
# Three homes with less than 500 sqft. I'll remove them from the dataset.
cleaning_data = cleaning_data.loc[cleaning_data["heatedarea"] >= 500]
# # # Heating Type # # #
# aheatingty- any cleaning needed? Might not use this.
cleaning_data.groupby("aheatingty").size() # most homes are air-ducted. It probably won't help the prediction to include this as a predictor.
# # # Heating Fuel # # #
# heatedfuel - might not use this either
cleaning_data.groupby("heatedfuel").size()
# # # AC Type # # #
# actype - I could engineer this into a binary feature: has AC / no AC. The number of homes without AC is very small compared to the rest of the pool, so it may not add much.
# but why not.
cleaning_data.groupby("actype").size()
ac_conditions = [(cleaning_data["actype"] == ("AC-NONE")),
(cleaning_data["actype"] != ("AC-NONE"))]
ac_values = ["no_AC", "has_AC"]
cleaning_data["has_ac"] = np.select(ac_conditions, ac_values)
cleaning_data["has_ac"].head()
cleaning_data.groupby("has_ac").size()
# # # Exterior Wall # # #
# extwall - probably won't use
cleaning_data.groupby("extwall", dropna=False).size()
# # # Foundation # # #
#foundation - probably won't use
cleaning_data.groupby("foundation", dropna=False).size()
# # # Fireplaces # # #
# numfirepla - most homes have one fireplace. I will divide this into three categories: no fireplace, one fireplace, or more than one fireplace.
cleaning_data.groupby("numfirepla", dropna=False).size()
fireplace_conditions = [cleaning_data['numfirepla'] == 0,
cleaning_data['numfirepla'] == 1,
cleaning_data['numfirepla'] > 1]
fireplace_categories = ["no_fireplace", "one_fireplace", "2+_fireplaces"]
cleaning_data["fireplace_cat"] = np.select(fireplace_conditions, fireplace_categories)
cleaning_data.groupby("fireplace_cat").size()
# # # Building Grades # # #
#bldggrade - will create two categories: average or better vs below average.
cleaning_data.groupby("bldggrade", dropna=False).size()
good_shape = ["AVERAGE", "CUSTOM", "EXCELLENT", "GOOD", "VERY GOOD"]
bad_shape = ["FAIR", "MINIMUM"]
bldggrade_conditions = [cleaning_data["bldggrade"].isin(good_shape),
cleaning_data["bldggrade"].isin(bad_shape)]
bldggrade_results = ["above_average", "below_average"]
cleaning_data["building_state"] = np.select(bldggrade_conditions, bldggrade_results)
cleaning_data.groupby("building_state").size()
building_state
0                   13
above_average    43059
below_average      664
dtype: int64
# how many null values do we have in each column?
# aheatingty, heatedfuel, actype, and extwall have the most NAs - a good reason not to use them (I was already inclined that way), although the NAs are small in number.
cleaning_data.isnull().sum()
# remove nas
cleaning_data = cleaning_data.dropna()
cleaning_data.isnull().sum()
municipali        0
dateofsale        0
price             0
yearbuilt         0
heatedarea        0
cdebuildin        0
descbuildi        0
storyheigh        0
aheatingty        0
heatedfuel        0
actype            0
extwall           0
foundation        0
numfirepla        0
bldggrade         0
fullbaths         0
halfbaths         0
bedrooms          0
units             0
shape_Leng        0
shape_Area        0
sale_year         0
toPredict         0
geometry          0
has_ac            0
fireplace_cat     0
building_state    0
dtype: int64
# # # Full Baths # # #
# fullbaths - doesn't seem to need any cleaning.
cleaning_data.groupby('fullbaths', dropna=False).size()
# # # Half Baths # # #
# Half baths - appears to be in order
cleaning_data.groupby("halfbaths", dropna=False).size()
# # # Bedrooms # # #
# 'bedrooms' - 0, 50, and 65 bedrooms seem implausible - will remove.
cleaning_data.groupby("bedrooms", dropna=False).size()
cleaning_data = cleaning_data.loc[(cleaning_data['bedrooms'] > 0) & (cleaning_data['bedrooms'] < 50)]
cleaning_data.groupby("bedrooms", dropna=False).size()
# # # Units # # #
# 'units' # I don't know why so many are listed as having 0 units.
#When I compare their values to others, it appears that they are much newer than other homes (average year built is 2020ish, vs 1990s for 1-unit dwellings).
# So this could be a data entry issue. I will opt not to use this feature.
cleaning_data.groupby("units", dropna=False).size()
# # # Sale Year # # #
# 'sale_year' - sale years range from 2020 to 2028. 2028 is clearly a mistake (and just one entry). Will remove that one and keep the others.
# Will also run sale_year through a datetime conversion so it's stored as a clean four-digit year.
cleaning_data.groupby("sale_year").size()
cleaning_data["sale_year"] = cleaning_data["sale_year"].dropna().loc[cleaning_data["sale_year"] < 2028]
cleaning_data["sale_year"] = pd.to_datetime(cleaning_data["sale_year"], format = '%Y').dt.strftime("%Y")
cleaning_data.dropna().groupby("sale_year", dropna=False).size()
sale_year
2020    15610
2021    17525
2022    10349
dtype: int64
After going through each column, there are only a few columns I want to keep. I'll filter the dataset down to just those columns, and make sure that all the missing values (NAs) have been dropped.
# After all that, there are only a few columns I want to keep. They are:
final_cols = ["municipali",
"dateofsale",
"price",
"yearbuilt",
"heatedarea",
"storyheigh",
"has_ac",
"fireplace_cat",
"building_state",
"fullbaths",
"halfbaths",
"bedrooms",
"sale_year",
"shape_Area",
"geometry"]
# Confirm that all NAs have been dropped
cleaning_data = cleaning_data.dropna()
cleaning_data.isnull().sum()
home_sales = cleaning_data[final_cols]
home_sales.head()
municipali | dateofsale | price | yearbuilt | heatedarea | storyheigh | has_ac | fireplace_cat | building_state | fullbaths | halfbaths | bedrooms | sale_year | shape_Area | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | HUNTERSVILLE | 2021-07-01 | 375,000.00 | 1976 | 1,596.00 | 1.00 | has_AC | one_fireplace | above_average | 2 | 0 | 3.00 | 2021 | 36,157.19 | POINT (-80.93759 35.43227) |
1 | HUNTERSVILLE | 2021-08-13 | 675,000.00 | 2010 | 3,244.00 | 2.00 | has_AC | one_fireplace | above_average | 3 | 1 | 4.00 | 2021 | 26,070.16 | POINT (-80.93604 35.43186) |
3 | HUNTERSVILLE | 2020-04-29 | 1,125,000.00 | 1991 | 4,272.00 | 1.00 | has_AC | one_fireplace | above_average | 4 | 0 | 4.00 | 2020 | 38,364.22 | POINT (-80.93474 35.43432) |
4 | HUNTERSVILLE | 2020-02-18 | 915,000.00 | 2014 | 3,542.00 | 2.00 | has_AC | one_fireplace | above_average | 4 | 1 | 4.00 | 2020 | 58,523.33 | POINT (-80.93803 35.43247) |
5 | HUNTERSVILLE | 2020-08-21 | 880,000.00 | 1992 | 3,520.00 | 1.00 | has_AC | one_fireplace | above_average | 3 | 0 | 3.00 | 2020 | 24,435.14 | POINT (-80.93793 35.43427) |
From mapping out our selected homes by price, we can see that homes tend to be priced higher in the northern tip of the county, and in the "Wedge" in the south of the city. Homes in the center of the county and radiating outward are lower priced.
# Map out homes by price
home_sales = gpd.GeoDataFrame(home_sales).to_crs("EPSG:4326")
pd.options.display.float_format = '{:20,.2f}'.format # turn off scientific notation
# Make a map of our selected homes
#Style
plt.style.use("seaborn-v0_8-whitegrid") #plt.style.available
# Plot
fig, ax = plt.subplots(figsize = (10,10))
home_sales.plot(ax = ax,
column = "price",
legend = True,
cmap = "coolwarm",
alpha = 0.3)
ax.set_title("Home Sale Prices, Mecklenburg NC", fontsize = 16)
ax.set_axis_off()
How well does the home's catchment elementary school perform? I suspect that home buyers may be motivated to pay more for homes within better-performing elementary school zones.
By mapping elementary schools' performance, we can see that the best schools appear to be in the northern neighborhoods and the Wedge - i.e. the areas we previously saw have higher home values. Therefore, elementary school performance may be a helpful predictor of home values.
# Elementary schools
elementary = gpd.read_file("data/elementary_school/SchoolBoundary_Elementary.shp").to_crs("4326")
elementary.head()
# Elementary grades
elem_grades = pd.read_csv("data/elem_grade.csv")
elem_grades.head()
# Join the elementary schools dataset and the elementary schools' grades data sets
elementary = elementary.merge(elem_grades, on = "ELEM_NAME", how = "left")
elementary.head()
elementary.groupby("ELEM_Performance").size()
elementary.head()
# Map Elementary schools by performance
plt.style.use("seaborn-v0_8-whitegrid")
# Plot
fig, ax = plt.subplots(figsize = (10,10))
elementary.plot(ax = ax,
column = "ELEM_Performance",
legend = True,
cmap = "coolwarm")
ax.set_title("Elementary Schools Performance, Mecklenburg NC", fontsize = 16)
ax.set_axis_off()
I'll join the schools' locations to our homes so that we know which catchment zone each home is in. Then, I'll map home prices on school performance zones.
More red dots (higher-priced homes) appear in areas of lighter colors (i.e. higher-performing elementary schools), while blue dots (lower-priced homes) appear in areas of darker colors (i.e. lower-performing elementary schools).
Therefore, it appears (though this needs to be explored more thoroughly) that elementary school performance is a good contender for predicting home values.
# Spatial join to specify which grade-level elementary school each home has available to them.
# first, filter to just our wanted columns
wanted_elem_cols = ["ELEM_NAME", "ELEM_Performance", "geometry"]
elementary = elementary[wanted_elem_cols]
#elementary.length
# Now there are duplicates - remove duplicates
#elementary = elementary.drop_duplicates('ELEM_NAME')
elementary.head()
# st join to home_sales
home_sales = home_sales.sjoin(elementary, how = "left")
home_sales.head()
# Map Home prices within Elementary schools' performance
plt.style.use("seaborn-v0_8-whitegrid")
# Plot
fig, ax = plt.subplots(figsize = (10,10))
elementary.plot(ax = ax,
column = "ELEM_Performance",
legend = True,
cmap = "binary")
home_sales.plot(ax = ax,
column = "price",
legend = True,
cmap = "coolwarm",
alpha = 0.1,
s = 3)
ax.set_title("Home prices and Area Elementary Schools Performance, Mecklenburg NC", fontsize = 16)
ax.set_axis_off()
What is the average price for homes within each school performance level?
It appears that homes within B school zones have the highest average values. This is a bit surprising, but overall the expected pattern holds: homes in better-performing school zones are worth more.
# What's the average price for each school's performance level?
price_by_performance = home_sales.groupby('ELEM_Performance')['price'].mean().reset_index()
price_by_performance
# Graph results
fig = plt.figure(figsize = (10, 5))
# creating the bar plot
plt.bar(price_by_performance['ELEM_Performance'], price_by_performance['price'], color ='maroon',
width = 0.4)
plt.xlabel("Elementary School Performance")
plt.ylabel("Mean Home Price")
plt.title("Average home prices by local elementary school performance, Mecklenberg NC")
plt.show()
Do prices vary by town? Let's see.
It appears that homes are generally higher-priced in Davidson and Cornelius.
# Prices by town
town_prices = home_sales.groupby('municipali')['price'].mean().reset_index()
#town_prices
plt.bar(town_prices['municipali'], town_prices['price'], color ='maroon',
width = 0.4)
plt.xlabel("Town")
plt.xticks(rotation = 90)
plt.ylabel("Mean Home Price")
plt.title("Average home prices by town, Mecklenberg NC")
plt.show()
As anyone who has bought a house can tell you, one of the most important parts of the process is the comps. In order to approve a mortgage, the bank has to be sure that the home is priced reasonably. How do they do that? By looking at the prices of similar houses nearby. We're going to do something similar and use the spatial lag of house prices as a feature for our price prediction model.
The spatial lag of house prices involves finding the average price of a home's K nearest neighbors, where K = 5. That is, we're going to find the average price of the five houses closest to each home.
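To make the idea concrete, here is a minimal sketch of the same calculation using PySAL (this is an alternative to the scikit-learn version I build later; it assumes a projected GeoDataFrame named gdf with a price column):
from libpysal import weights
# Build a K-nearest-neighbors weights object (5 neighbors per home, self excluded)
w_knn = weights.KNN.from_dataframe(gdf, k=5)
# Row-standardize so each neighbor contributes 1/5 to the lag
w_knn.transform = "R"
# The spatial lag is then the mean price of each home's five nearest neighbors
gdf["price_lag"] = weights.lag_spatial(w_knn, gdf["price"].values)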
Since we know that comps are important to real estate, we can be reasonably sure this is going to be a valuable addition to our model. But what if we weren't sure about this?
Below, I'm going to go through a LISA analysis to demonstrate how home prices are spatially arranged. LISA stands for Local Indicators of Spatial Autocorrelation (more information on LISA [here](https://geographicdata.science/book/notebooks/07_local_autocorrelation.html)). These statistics can help us understand how something (in this case, house prices) is arranged in space.
To start with, I'll set up the spatial weights matrix, using the five nearest houses to each home. Next, I'll plot the distribution of the local Moran's I (LISA) statistics across all homes.
This plot shows the distribution of LISA statistics for our dataset. The graph has a very long right tail, indicating that many homes have positive local spatial association - they are priced very similarly to the houses nearest them. We don't know whether their prices are low or high, just that they are similar to the five houses nearest them.
There is a very small left tail, indicating that some (but relatively few) homes have negative local spatial association - they are priced very differently from the houses nearest them. We don't know if their prices are higher or lower than their neighbors', just that they're different.
All of this suggests that it may be important for our model to account for spatial patterns in the data.
# Calculate LISAs (local indicators of spatial autocorrelation)
w_lisa = weights.distance.KNN.from_dataframe(home_sales, k = 5)
lisa = Moran_Local(home_sales["price"], w_lisa)
# kernel density estimate - i.e. visualize distribution of observations in a dataset
ax = seaborn.kdeplot(lisa.Is)
seaborn.rugplot(lisa.Is, ax=ax)
<Axes: ylabel='Density'>
Next, we'll map the local Moran's I statistic for every home. I'll walk through each map below.
from splot import esda as esdaplot
# Set up figure and axes
f, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
# Make the axes accessible with single indexing
axs = axs.flatten()
# Subplot 1 #
# Map of local statistics
# Grab first axis in the figure
ax = axs[0]
# Assign new column with local statistics on-the-fly
home_sales.assign(
Is=lisa.Is
# Plot local statistics
).plot(
column="Is",
cmap="plasma",
scheme="quantiles",
k=5,
edgecolor="white",
linewidth=0.1,
alpha=0.3,
legend=True,
ax=ax,
)
# Subplot 2 #
# Quadrant categories
# Grab second axis of local statistics
ax = axs[1]
# Plot quadrant colors (note: to ensure all homes are assigned a
# quadrant, we "trick" the function by setting the significance level to
# 1, so all observations are treated as "significant" and thus assigned
# a quadrant color)
esdaplot.lisa_cluster(lisa, home_sales, p=1, ax=ax, alpha=0.3)
# Subplot 3 #
# Significance map
# Grab third axis of local statistics
ax = axs[2]
#
# Find out significant observations
labels = pd.Series(
1 * (lisa.p_sim < 0.05), # Assign 1 if significant, 0 otherwise
index=home_sales.index # Use the index in the original data
# Recode 1 to "Significant" and 0 to "Non-significant"
).map({1: "Significant", 0: "Non-Significant"})
# Assign labels to `home_sales` on the fly
home_sales.assign(
cl=labels
# Plot (non-)significant areas
).plot(
column="cl",
categorical=True,
k=2,
cmap="Paired",
linewidth=0.1,
edgecolor="white",
legend=True,
ax=ax,
alpha=0.3
)
# Subplot 4 #
# Cluster map
# Grab second axis of local statistics
ax = axs[3]
# Plot Quadrant colors In this case, we use a 5% significance
# level to select polygons as part of statistically significant
# clusters
esdaplot.lisa_cluster(lisa, home_sales, p=0.05, ax=ax, alpha=0.3)
# Figure styling #
# Set title to each subplot
for i, ax in enumerate(axs.flatten()):
ax.set_axis_off()
ax.set_title(
[
"Local Statistics",
"Scatterplot Quadrant",
"Statistical Significance",
"Moran Cluster Map",
][i],
y=0,
)
# Tight layout to minimize in-between white space
f.tight_layout()
# Display the figure
plt.show()
This map shows the value of the local Moran's I for each home - i.e., compared to the home's five nearest neighbors, how similar is that home's price? If the value is high (peach, and especially yellow), the home is surrounded by homes of very similar value: it's priced like the homes near it, which is a dream for a house assessor. If the value is negative (purple), the home is surrounded by homes of very different values: its price is disparate from the prices of nearby homes. It could be a great deal, a very bad deal, or simply an unusual home for the area that is priced differently - a headache for assessors. If the value is around zero (pinks), the home sits among homes at a variety of price points, i.e. the surrounding prices look almost random. This could mean it's in an area with many different kinds of homes at different price points, which is also a headache for assessors.
This map shows how each home's price compares to the prices of homes nearby. Homes in red (HH) are high-priced homes surrounded by other high-priced homes. Homes in peach (HL) are high-priced homes surrounded by lower-priced homes. Homes in light blue (LH) are low-priced homes surrounded by higher-priced homes. Homes in darker blue (LL) are low-priced homes surrounded by similarly low-priced homes.
This map shows whether the local Moran's I value for each home is statistically significant at a threshold of 5%. If the value is statistically significant, the observed pattern is unlikely to be due to random chance. If it is not statistically significant, we can't rule out that the value arose by random chance.
This map combines maps two and three: homes with non-significant local Moran's I values are greyed out. The remaining homes show their price in relation to nearby home prices. Homes in red (HH) are high-priced homes surrounded by similarly high-priced homes, and the similarity is statistically significant. Homes in peach (HL) are high-priced homes surrounded by low-priced homes, and the difference is statistically significant - this could indicate a "flipped" home, or one that is dissimilar to its neighbors and thus commands a higher price. Homes in light blue (LH) are low-priced homes surrounded by high-priced homes, and the difference is statistically significant - perhaps a fixer-upper or a special deal. Homes in dark blue (LL) are low-priced homes surrounded by low-priced homes, and the similarity is statistically significant.
Overall, we can see a very distinct spatial pattern in home prices in Charlotte: low-priced homes are concentrated in the center of the city, with the exception being the "Wedge" of concentrated higher-priced homes in the south of the city. The area to the north of the city also has a concentration of higher-priced homes. This underscores the idea that the value of nearby homes has a strong effect on a home's value.
Exactly how many homes fall into the HH/LL/HL/LH categories? Below, I've calculated the proportion of homes within each category. While about 65% of homes have local Moran's I statistics that aren't statistically significant (i.e. could be the result of random chance), about a third are significant: 23% are low-priced homes surrounded by low-priced homes, and 11% are high-priced homes surrounded by high-priced homes. Less than one percentage point each falls into the categories of homes surrounded by differently-priced homes. This emphasizes just how important neighbors' prices are to a home's value.
# How many exactly are HH/LL/HL/LH/non-sig?
home_sales["p_sim"] = lisa.p_sim
# 1 if significant (p <= 0.05), 0 otherwise
sig = 1 * (lisa.p_sim < 0.05)
# assign significance flag
home_sales["sig"] = sig
# assign what part of quadrant each home is in, but only if significant - 0 otherwise for non-significant ones
spots = lisa.q * sig
spots_labels = {
0 : "Non-significant",
1 : "HH",
2 : "LH",
3 : "LL",
4 : "HL"
}
# create column with labels
home_sales["labels"] = pd.Series(spots, index = home_sales.index
).map(spots_labels)
# See proportion breakdown as percentages
home_sales['labels'].value_counts(normalize = True) * 100
labels
Non-significant    65.36
LL                 22.70
HH                 11.12
LH                  0.51
HL                  0.32
Name: proportion, dtype: float64
All of this tells us that home prices frequently cluster in space in Mecklenburg County. Therefore, we'll create a spatial lag feature for home prices. That is, we'll create a column that, for each house, provides the average price of its five nearest neighbors.
# Feature engineering
# Spatial lag of home prices - what is the average price of the five nearest homes?
from sklearn.neighbors import NearestNeighbors
def get_xy_from_geometry(df):
"""
Return a numpy array with two columns, where the
first holds the `x` geometry coordinate and the second
column holds the `y` geometry coordinate
Note: this works with both Point() and Polygon() objects.
"""
# NEW: use the centroid.x and centroid.y to support Polygon() and Point() geometries
x = df.geometry.centroid.x
y = df.geometry.centroid.y
return np.column_stack((x, y)) # stack as columns
# get x/y values of houses
# first, convert CRS
home_sales = home_sales.to_crs("EPSG:3358") # Initially I used EPSG:2264, but this resulted in some inf geometries. Seems to work better with this CRS.
salesXY = get_xy_from_geometry(home_sales)
N = 5
k = N + 1
nbrs = NearestNeighbors(n_neighbors=k)
nbrs.fit(salesXY)
salesDists, salesIndices = nbrs.kneighbors(salesXY)
#use indices to ID which homes are the 5 neighbors of the original data
salesIndices[:, 1:]
# Take the sale prices from the original data frame
all_sale_prices = home_sales["price"].values
#get the sales prices for the 5 nearest neighbors (ignoring first match)
neighboring_prices = all_sale_prices[salesIndices[:, 1:]]
# Add neighboring prices to our data set.
home_sales["NN_price"] = neighboring_prices.mean(axis = 1)
home_sales = home_sales.to_crs("4326")
home_sales.head()
municipali | dateofsale | price | yearbuilt | heatedarea | storyheigh | has_ac | fireplace_cat | building_state | fullbaths | ... | sale_year | shape_Area | geometry | index_right | ELEM_NAME | ELEM_Performance | p_sim | sig | labels | NN_price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | HUNTERSVILLE | 2021-07-01 | 375,000.00 | 1976 | 1,596.00 | 1.00 | has_AC | one_fireplace | above_average | 2 | ... | 2021 | 36,157.19 | POINT (-80.93759 35.43227) | 11 | Barnette | B | 0.00 | 1 | LH | 773,000.00 |
1 | HUNTERSVILLE | 2021-08-13 | 675,000.00 | 2010 | 3,244.00 | 2.00 | has_AC | one_fireplace | above_average | 3 | ... | 2021 | 26,070.16 | POINT (-80.93604 35.43186) | 11 | Barnette | B | 0.01 | 1 | HH | 893,000.00 |
3 | HUNTERSVILLE | 2020-04-29 | 1,125,000.00 | 1991 | 4,272.00 | 1.00 | has_AC | one_fireplace | above_average | 4 | ... | 2020 | 38,364.22 | POINT (-80.93474 35.43432) | 11 | Barnette | B | 0.00 | 1 | HH | 939,000.00 |
4 | HUNTERSVILLE | 2020-02-18 | 915,000.00 | 2014 | 3,542.00 | 2.00 | has_AC | one_fireplace | above_average | 4 | ... | 2020 | 58,523.33 | POINT (-80.93803 35.43247) | 11 | Barnette | B | 0.04 | 1 | HH | 665,000.00 |
5 | HUNTERSVILLE | 2020-08-21 | 880,000.00 | 1992 | 3,520.00 | 1.00 | has_AC | one_fireplace | above_average | 3 | ... | 2020 | 24,435.14 | POINT (-80.93793 35.43427) | 11 | Barnette | B | 0.01 | 1 | HH | 848,000.00 |
5 rows × 22 columns
Mecklenburg County borders several lakes. I suspect that homes closer to the lakes may be worth more. Let's find out.
First, I'll ingest the local lakes and calculate the distance from each home to the nearest lake. I'll map this out.
# Distance to Waterfront
raw_lakes = gpd.read_file("data/lakes_ponds/Lakes_Ponds.shp").to_crs("4326") #"EPSG:3358"
#raw_lakes.head()
# Figure out which lakes we want
#raw_lakes['feat_type'].unique()
wanted_lakes = ["Lake Norman", "Mountain Island Lake", "Catawba River / Lake Wylie"]
lakes = raw_lakes[raw_lakes['feat_type'].isin(wanted_lakes)]
lakes.head()
lakes['dummy_col'] = 1
lakes.head()
# Dissolve all the lakes into just one body of water
lakes = lakes.dissolve('dummy_col').reset_index()
lakes.head()
# Measure the distance from the lake to each home
# First, convert the lakes into a geometry
lakes = lakes.to_crs("EPSG:3358")
# Convert the homes into a geometry
home_sales = home_sales.to_crs("EPSG:3358")
# Calculate distance from homes to the lakes (EPSG:3358 units are meters)
home_sales["Dist_to_Lakes"] = home_sales.apply(lambda x: x.geometry.distance(lakes.geometry[0]), axis = 1)
home_sales.to_crs("4326").head()
# Map lakes and homes distances to the lakes
home_sales = home_sales.to_crs("4326")
lakes = lakes.to_crs("4326")
plt.style.use("seaborn-v0_8-whitegrid")
# Plot
fig, ax = plt.subplots(figsize = (10,10))
lakes.plot(ax = ax,
column = "feat_type",
legend = True,
color = "blue")
#cmap = "binary")
home_sales.plot(ax = ax,
column = "Dist_to_Lakes",
legend = True,
cmap = "coolwarm",
alpha = 0.1,
s = 3)
ax.set_title("Homes' Distances (in feet) to Area Lakes, Mecklenburg, NC", fontsize = 16)
ax.set_axis_off()
Now I'll graph homes' prices against their distance from the lakes. If proximity to the lakes matters, we would expect prices to generally fall as distance from the lakes increases.
Instead, home prices are all over the place, indicating that proximity to the lakes is not a strong predictor of home prices. This graph does not support our hypothesis that homes nearer the lakes are more expensive.
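The plotting code isn't shown here, but a minimal sketch of the scatter I'm describing (using the price and Dist_to_Lakes columns built above) could look like this:
# Scatter of sale price against distance to the nearest lake
fig, ax = plt.subplots(figsize = (10, 6))
ax.scatter(home_sales["Dist_to_Lakes"], home_sales["price"], alpha = 0.1, s = 3)
ax.set_xlabel("Distance to nearest lake (meters, EPSG:3358)")
ax.set_ylabel("Sale price")
ax.set_title("Home Sale Prices by Distance to Area Lakes, Mecklenburg, NC")
plt.show()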
Now I'll set up a correlation plot to see which features have strong correlations with home values. This will help with feature selection for our model.
Based on these correlations, the most promising feature appears to be NN_price, followed by heatedarea. Bedrooms, bathrooms, and yearbuilt may also help.
# Correlation matrix
# Fill diagonal and upper half with NaNs
corr = home_sales.corr(numeric_only=True)
#corr.background_gradient(cmap='coolwarm')
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
corr[mask] = np.nan
(corr
.style
.background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
.highlight_null(color='#f1f1f1') # Color NaNs grey
.format(precision=2))
price | yearbuilt | heatedarea | storyheigh | fullbaths | halfbaths | bedrooms | shape_Area | index_right | p_sim | sig | NN_price | Dist_to_Lakes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
price | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
yearbuilt | 0.02 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
heatedarea | 0.68 | 0.35 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
storyheigh | 0.27 | 0.50 | 0.56 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
fullbaths | 0.57 | 0.40 | 0.77 | 0.45 | nan | nan | nan | nan | nan | nan | nan | nan | nan |
halfbaths | 0.19 | 0.27 | 0.34 | 0.57 | 0.06 | nan | nan | nan | nan | nan | nan | nan | nan |
bedrooms | 0.42 | 0.34 | 0.71 | 0.52 | 0.68 | 0.24 | nan | nan | nan | nan | nan | nan | nan |
shape_Area | 0.16 | -0.16 | 0.13 | -0.07 | 0.06 | -0.02 | 0.03 | nan | nan | nan | nan | nan | nan |
index_right | -0.09 | -0.06 | -0.10 | -0.03 | -0.10 | -0.01 | -0.05 | -0.01 | nan | nan | nan | nan | nan |
p_sim | -0.08 | 0.19 | 0.11 | 0.14 | 0.06 | 0.05 | 0.12 | -0.01 | -0.05 | nan | nan | nan | nan |
sig | 0.12 | -0.20 | -0.09 | -0.16 | -0.04 | -0.07 | -0.10 | 0.02 | -0.00 | -0.71 | nan | nan | nan |
NN_price | 0.83 | -0.08 | 0.59 | 0.21 | 0.48 | 0.14 | 0.35 | 0.10 | -0.11 | -0.09 | 0.14 | nan | nan |
Dist_to_Lakes | 0.01 | -0.13 | -0.05 | -0.07 | -0.05 | -0.02 | -0.04 | 0.05 | -0.06 | -0.05 | 0.04 | 0.02 | nan |
Now that I've cleaned the data and built features that I think will help with price predictions, it's time to create the model.
First, I select different columns and transform them: I standardize my numeric columns and one-hot-encode my categorical columns.
# Select numeric columns
num_cols = [
'yearbuilt',
'heatedarea',
'storyheigh',
'fullbaths',
#'halfbaths',
'bedrooms',
'sale_year',
'NN_price',
#'Dist_to_Lakes'
]
# Select categorical columns
cat_cols = [
#'municipali',
#"has_ac",
#"fireplace_cat",
"building_state",
"ELEM_Performance"
]
# Initialize the OHE transformer
ohe = OneHotEncoder()
# Set up the column transformer with two transformers
transformer = ColumnTransformer(
transformers=[
# Scale the numerical columns
("num", StandardScaler(), num_cols),
# One-hot encode the categorical columns
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
)
Next, I split my dataset into training and test sets. After that, I set up a pipeline for my model and fit it on the training set. I then score its predictions on the test set. The model's R-squared on the test set is about 0.79 - not bad!
# Set up a training and test set (60/40)
train_set, test_set = train_test_split(home_sales, test_size=0.4, random_state=42)
# The target labels: sale price
y_train = train_set["price"]
y_test = test_set["price"]
#Set up a pipeline that includes both numerical columns and categorical columns
#total_cols = len(num_cols) + len(cat_cols)
pipe = make_pipeline(transformer, LinearRegression(fit_intercept = False))
# Fit on training data
pipe.fit(train_set, y_train)
# Retrieve test score
pipe.score(test_set, y_test)
0.792540392046871
Now I want to cross-validate my results using k-fold cross-validation, with k set to 20 folds. The average R-squared is 0.79, indicating that the model explains about 79% of the variation in Mecklenburg home prices.
# Report R^2
from sklearn.model_selection import cross_val_score
scores = cross_val_score(
pipe,
test_set,
y_test,
cv=20,
)
# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Score std dev = ", scores.std())
R^2 scores =  [0.81912491 0.74021944 0.76839957 0.80955719 0.79258616 0.76012419
 0.81172916 0.79503181 0.76327889 0.74353617 0.81005301 0.82127631
 0.78165779 0.80253127 0.81769466 0.80888472 0.79425987 0.81015765
 0.77641603 0.79948575]
Scores mean =  0.7913002290336475
Score std dev =  0.024331884327164207
To understand how accurate our model is, I'll calculate the mean absolute error and the Mean Absolute Percentage Error (MAPE). On average, the model's predictions are off by about $74,000. In MAPE terms, the predictions are off by about 20% on average, which corresponds to an "accuracy" of 79.9%.
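For reference, the function below reports the mean absolute error and MAPE, where $\hat{y}_i$ is the predicted price and $y_i$ the actual price:
$$\text{MAE} = \frac{1}{n}\sum_{i=1}^{n}\left|\hat{y}_i - y_i\right|, \qquad \text{MAPE} = \frac{100}{n}\sum_{i=1}^{n}\frac{\left|\hat{y}_i - y_i\right|}{y_i}, \qquad \text{Accuracy} = 100 - \text{MAPE}$$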
# First, define a function to calculate the mean absolute percent error
def evaluate_mape(model, X_test, y_test):
"""
Given a model and test features/targets, print out the
mean absolute error and accuracy
"""
# Make the predictions
predictions = model.predict(X_test)
# Absolute error
errors = abs(predictions - y_test)
avg_error = np.mean(errors)
# Mean absolute percentage error
mape = 100 * np.mean(errors / y_test)
# Accuracy
accuracy = 100 - mape
print("Model Performance")
print(f"Average Absolute Error: {avg_error:0.4f}")
print(f"Accuracy = {accuracy:0.2f}%.")
return accuracy
# Fit on train set
pipe.fit(train_set,
y_train)
# Evaluate on test set
evaluate_mape(pipe,
test_set,
y_test)
Model Performance
Average Absolute Error: 74427.8238
Accuracy = 79.90%.
79.89531200479136
Now I want to review the significance of our different variables, so I'll print out regression summary statistics. All of our variables are statistically significant (p <= 0.05). However, multicollinearity may be a problem.
regression_results = smf.ols('''price ~
yearbuilt +
heatedarea +
sale_year +
NN_price +
ELEM_Performance
''',
data = home_sales).fit()
print(regression_results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                  price   R-squared:                       0.786
Model:                            OLS   Adj. R-squared:                  0.786
Method:                 Least Squares   F-statistic:                 1.760e+04
Date:                Thu, 12 Oct 2023   Prob (F-statistic):               0.00
Time:                        17:59:39   Log-Likelihood:            -5.6605e+05
No. Observations:               43101   AIC:                         1.132e+06
Df Residuals:                   43091   BIC:                         1.132e+06
Df Model:                           9
Covariance Type:            nonrobust
=========================================================================================
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept              6.595e+05   5.68e+04     11.607      0.000    5.48e+05    7.71e+05
sale_year[T.2021]      6.317e+04   1352.676     46.702      0.000    6.05e+04    6.58e+04
sale_year[T.2022]      1.331e+05   1562.712     85.150      0.000     1.3e+05    1.36e+05
ELEM_Performance[T.B]  1.815e+04   3821.543      4.750      0.000    1.07e+04    2.56e+04
ELEM_Performance[T.C]  1.548e+04   3799.524      4.074      0.000    8033.298    2.29e+04
ELEM_Performance[T.D]  2.371e+04   3851.365      6.156      0.000    1.62e+04    3.13e+04
ELEM_Performance[T.F]  3.793e+04   4126.165      9.193      0.000    2.98e+04     4.6e+04
yearbuilt              -417.7556     28.596    -14.609      0.000    -473.803    -361.708
heatedarea               91.0720      0.864    105.427      0.000      89.379      92.765
NN_price                  0.7387      0.004    202.924      0.000       0.732       0.746
==============================================================================
Omnibus:                    17897.533   Durbin-Watson:                   1.776
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           356358.398
Skew:                           1.509   Prob(JB):                         0.00
Kurtosis:                      16.759   Cond. No.                     4.74e+07
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.74e+07. This might indicate that there are
strong multicollinearity or other numerical problems.
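To probe that multicollinearity warning, one option (a quick sketch, not part of my original workflow) is to compute variance inflation factors for the numeric predictors in the formula:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Numeric predictors from the regression formula above, plus a constant
X = sm.add_constant(home_sales[["yearbuilt", "heatedarea", "NN_price"]].astype(float))
# A VIF above roughly 5-10 is a common rule of thumb for problematic collinearity
vifs = pd.Series(
    [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    index = X.columns)
print(vifs)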
What does this accuracy look like? Below, I've created a scatterplot of homes' actual sale price versus the price my model predicted they'd sell for. The black diagonal line shows what a perfect prediction would look like; how close or far away each point is from that line shows how accurate (or inaccurate) the prediction was. It's a good sign that most homes cluster around this black line. However, we see that as homes' actual values rise, our predictions get worse - we tend to under-predict. There is certainly room to improve our predictions.
test_set_homes = home_sales.loc[test_set.index]
test_set_homes.head()
predictions = pipe.predict(test_set_homes)
actual = test_set_homes["price"]
mape_per_row = abs((predictions - actual) / actual) * 100
test_set_homes["MAPE"] = mape_per_row
test_set_homes["Prediction"] = predictions
test_set_homes["MAE"] = abs(predictions - actual)
# Scatterplot of homes' actual sale prices against their predicted sale price
def make_diagonal(x, y): # uses the data to set the start and end points of the diagonal line
xx, yy = [min(x), max(x)], [min(y), max(y)]
plt.plot(xx, yy,
color = "black")
plt.style.use("seaborn-v0_8-whitegrid")
fig = plt.figure(figsize = (10,10))
plt.scatter(x = test_set_homes['price'],
y = test_set_homes['Prediction'],
alpha = 0.3)
make_diagonal(test_set_homes['price'], test_set_homes['price'])
plt.xlabel("Home actual sale price")
plt.ylabel("Home predicted sale price")
plt.title("Predicted vs Actual Home Sale Prices, Mecklenburg, NC")
plt.show()
Is there a pattern to our errors? It's possible that we may have missed a spatial feature that would be a strong predictor of home prices. Examining our errors can help us see if there is a spatial pattern to our errors.
To start with, we'll map our errors. It appears that our errors are spread throughout the county, and don't cluster in one area or another. That's a good sign.
# narrow in on errors
threshold = test_set_homes['MAPE'].mean()
bigger_errors = test_set_homes.loc[test_set_homes["MAPE"] > threshold]
plt.style.use("seaborn-v0_8-whitegrid")
# Map errors
fig, ax = plt.subplots(figsize = (10,10))
bigger_errors.plot(ax = ax,
column = "MAPE",
legend = True,
cmap = "coolwarm",
alpha = 0.1)
#s = 3)
ax.set_title("Home Price Prediction MAPE, Mecklenburg, NC", fontsize = 16)
ax.set_axis_off()
We can test for spatial autocorrelation using Moran's I. We'll start with the county's original home prices. The value of Moran's I for home prices is 0.71, indicating that home prices are clustered.
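For reference, with row-standardized weights $w_{ij}$, prices $x_i$, and mean price $\bar{x}$, the global Moran's I is defined as:
$$I = \frac{n}{\sum_{i}\sum_{j} w_{ij}} \cdot \frac{\sum_{i}\sum_{j} w_{ij}\,(x_i - \bar{x})(x_j - \bar{x})}{\sum_{i}(x_i - \bar{x})^2}$$
Values near +1 indicate that similar prices cluster together, values near 0 indicate spatial randomness, and negative values indicate that dissimilar prices tend to be neighbors.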
# Moran's I
y = home_sales['price'].values
w = Queen.from_dataframe(home_sales)
w.transform = 'r' # row-standardize the weights
moran = Moran(y, w)
moran.I
0.7072866922138921
What about our errors? Is there a spatial pattern to our errors (i.e. is there spatial autocorrelation of errors)?
The value of Moran's I for our prediction errors is 0.09, which tells us that our errors are close to spatially random. Overall, this indicates that we have accounted for most of the spatial patterns in our home price model.
# Calculate Moran's I for our prediction errors
test_set_homes["error"] = predictions - actual
y_errors = test_set_homes['error'].values
w_errors = Queen.from_dataframe(test_set_homes)
w_errors.transform = 'r'
moran_errors = Moran(y_errors, w_errors)
moran_errors.I
0.08544212800412855
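As a quick check (a short sketch using the moran_errors object computed above), esda also reports a permutation-based pseudo p-value, which tells us whether even this small amount of clustering is distinguishable from spatial randomness:
# Pseudo p-value from esda's random permutations (999 by default):
# a small value means the observed I is unlikely under spatial randomness.
print(f"Moran's I of errors: {moran_errors.I:.3f}")
print(f"Pseudo p-value:      {moran_errors.p_sim:.3f}")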