In [1]:
import plainEnglishCoding
import pprint,sympy
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import ipywidgets as widgets

https://en.wikipedia.org/wiki/Ordinary_least_squares

Failure to Backsolve

In the previous module, we discussed backsolving the linear system

$ M\cdot\vec{p}=\vec{c} $

where the matrix $M$ and vector $\vec{c}$ are known but $\vec{p}$ is unknown.

This process effectively finds the inverse matrix, $M^{-1}$, which acts on $\vec{c}$ to find $\vec{p}$:

$ \vec{p} = M^{-1}\vec{c} \hspace{1cm} $ and $ \hspace{1cm} M^{-1} M = I $
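As a quick sketch (with purchase amounts and total costs assumed to match the previous module's grocery example of a fixed $\$2$ apple and $\$3$ orange), `np.linalg.solve` performs this backsolving for us; it is numerically preferable to forming $M^{-1}$ explicitly, but both routes agree:

# backsolving sketch with assumed purchase amounts and total costs
import numpy as np
M = np.array([[2., 3.],    # person 1 buys 2 apples, 3 oranges
              [3., 5.]])   # person 2 buys 3 apples, 5 oranges
c = np.array([13., 21.])   # total costs at fixed prices of $2 and $3
p_inv   = np.linalg.inv(M) @ c    # explicit inverse: p = M^{-1} c
p_solve = np.linalg.solve(M, c)   # backsolve without forming the inverse
print(p_inv, p_solve)             # both give [2. 3.]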

Backsolving is great if each component in $\vec{p}=\begin{bmatrix}p_1 \\ p_2 \\ \vdots \\ p_n \end{bmatrix}$ has a single solution.

In our previous example, the $\$2$ price for an apple and the $\$3$ price for an orange were fixed to a single value for every shopper.

But what if the individual shoppers shop at different grocery stores?

This assumption of a fixed price across different grocery stores is pretty rigid and unrealistic.

Let's consider a system of linear equations where several individuals each purchase some number of apples and oranges from different stores, where prices might differ.

Let's assume $\color{orange}{p_{\text{orange}}}=\$3$ always but $\color{red}{p_{\text{apple}}}=\$2$ for the first person and $\color{red}{p_{\text{apple}}}=\$2.2$ for the second person.

$ \begin{align} 2\cdot \color{red}{p_{\text{apple}}} +3\cdot \color{orange}{p_{\text{orange}}} &= 2\cdot\color{red}{2} + 3\cdot\color{orange}{3} = 13 \\ 3\cdot \color{red}{p_{\text{apple}}} +5\cdot \color{orange}{p_{\text{orange}}} &= 3\cdot \color{red}{2.2}+ 5\cdot\color{orange}{3} = 21.6 \\ \end{align} $

There are two equations and two unknown variables, and no equation is a multiple of another.

This system appears to pass our requirements for backsolving, so let's try it.

$ 2\cdot \color{red}{p_{\text{apple}}} +3\cdot \color{orange}{p_{\text{orange}}}= 13 \hspace{1cm} $ implies $ \hspace{1cm} \color{red}{p_{\text{apple}}} = \frac{13}{2} - \frac{3}{2}\cdot \color{orange}{p_{\text{orange}}} $

So our system becomes

$ \bbox[border: 1pt solid black]{ \begin{align} 2\cdot \color{red}{p_{\text{apple}}} +3\cdot \color{orange}{p_{\text{orange}}} &= 13 \\ 3\cdot \color{red}{p_{\text{apple}}} +5\cdot \color{orange}{p_{\text{orange}}} &= 21.6 \\ \end{align} } $ $ \hspace{1cm}\Rightarrow\hspace{1cm} \bbox[border: 1pt solid black]{ \begin{align} 2\cdot \color{red}{p_{\text{apple}}} +3\cdot \color{orange}{p_{\text{orange}}} &= 13 \\ 3\cdot \left(\frac{13}{2} - \frac{3}{2}\cdot \color{orange}{p_{\text{orange}}}\right) +5\cdot \color{orange}{p_{\text{orange}}}&= 21.6 \\ \end{align} } $ $ \hspace{1cm}\Rightarrow\hspace{1cm} \bbox[border: 1pt solid black]{ \begin{align} 2\cdot \color{red}{p_{\text{apple}}} +3\cdot \color{orange}{p_{\text{orange}}} &= 13 \\ \color{orange}{p_{\text{orange}}}&= 4.2 \\ \end{align} } $

And then plugging in $\hspace{1cm}\color{orange}{p_{\text{orange}}}= 4.2\hspace{1cm}$ into the first equation, we get

$ \bbox[border: 1pt solid black]{ \begin{align} 2\cdot \color{red}{p_{\text{apple}}} +3\cdot \color{orange}{p_{\text{orange}}} &= 13 \\ \color{orange}{p_{\text{orange}}}&= 4.2 \\ \end{align} } $ $ \hspace{1cm}\Rightarrow\hspace{1cm} \bbox[border: 1pt solid black]{ \begin{align} 2\cdot \color{red}{p_{\text{apple}}} +3\cdot 4.2 &= 13 \\ \color{orange}{p_{\text{orange}}}&= 4.2 \\ \end{align} } $ $ \hspace{1cm}\Rightarrow\hspace{1cm} \bbox[border: 1pt solid black]{ \begin{align} \color{red}{p_{\text{apple}}}&= 0.2 \\ \color{orange}{p_{\text{orange}}}&= 4.2 \\ \end{align} } $

But this solution is nothing like our original prices!

This problem creates issues programmatically too.

In [2]:
# Create our system of equations.
# Need to run slightly different code to accommodate the different prices among individuals.
p = np.array([
    [2,  3,],
    [2.2,3,],
])
M = np.array([
    [2,3,],
    [3,5],
])
c = np.sum(M*p,axis=1)
c
Out[2]:
array([13. , 21.6])
In [3]:
# attempt to backsolve
p2 = np.linalg.solve(M,c)
p2
Out[3]:
array([0.2, 4.2])
In [4]:
# is this alternative answer 'correct'?
print(c)
M@p2
[13.  21.6]
Out[4]:
array([13. , 21.6])

Solving Overdetermined Linear Systems with Matrix Transpose

Consider an overdetermined system with more equations than unknown variables

$ M = \begin{bmatrix} m_{11} & m_{12} & \cdots & m_{1p} \\ m_{21} & m_{22} & \cdots & m_{2p} \\ \vdots & \vdots & \ddots & \vdots \\ m_{n1} & m_{n2} & \cdots & m_{np} \\ \end{bmatrix}, \hspace{1cm} \vec{x} = \begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_p \end{bmatrix}, \hspace{1cm} \vec{y} = \begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{bmatrix}, $

such that $\hspace{1cm}M\cdot\vec{x}\approx\vec{y}\hspace{1cm}$ and $n>p$.

As before, we can think about $\vec{x}$ as prices for $p$ different grocery items, $M$ as the purchases of $n$ individuals, and $\vec{y}$ as the total costs for each individual.

$M$ is effectively a data set of purchasing behaviors across individuals.

However, because $M$ is an $n\times p$ rectangular matrix, $M^{-1}$ does not exist. So, we cannot backsolve this system.

Since $M^{-1}$ does not exist, we cannot solve the system as

$ \left(M^{-1}M\right)\vec{x} = M^{-1}\vec{y} \hspace{1cm}\Rightarrow\hspace{1cm} \vec{x} = M^{-1}\vec{y} $
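As a quick illustration (with a made-up $3\times2$ matrix), NumPy refuses to invert a rectangular matrix:

# np.linalg.inv raises an error for a non-square (here 3x2) matrix
M = np.reshape(np.arange(6) + 1.0, (3, 2))
try:
    np.linalg.inv(M)
except np.linalg.LinAlgError as err:
    print("LinAlgError:", err)   # Last 2 dimensions of the array must be square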

Instead, we will use the transpose of $M$, denoted $M^T$.

You transpose a matrix by reflecting its components about the main diagonal, which runs from the top left to the bottom right.

Here are some examples:

  • $ \begin{bmatrix} 1 & 2 \end{bmatrix}^{\operatorname{T}} = \, \begin{bmatrix} 1 \\ 2 \end{bmatrix} $
  • $ \begin{bmatrix} 1 & 2 \\ 3 & 4 \end{bmatrix}^{\operatorname{T}} = \begin{bmatrix} 1 & 3 \\ 2 & 4 \end{bmatrix} $
  • $ \begin{bmatrix} 1 & 2 \\ 3 & 4 \\ 5 & 6 \end{bmatrix}^{\operatorname{T}} = \begin{bmatrix} 1 & 3 & 5\\ 2 & 4 & 6 \end{bmatrix} $

If $M$ is an $n\times p\,$ matrix, then $M^T$ will be a $p\times n$ matrix.

What about $M^TM$?

Here is an example for $M$ a $3\times2$ matrix. $ M^TM = \begin{bmatrix}m_{11} & m_{21} & m_{31}\\m_{12} & m_{22} & m_{32}\end{bmatrix} \cdot \begin{bmatrix}m_{11} & m_{12}\\m_{21} & m_{22}\\m_{31} & m_{32}\end{bmatrix} = \begin{bmatrix}m_{11}^{2} + m_{21}^{2} + m_{31}^{2} & m_{11} m_{12} + m_{21} m_{22} + m_{31} m_{32}\\m_{11} m_{12} + m_{21} m_{22} + m_{31} m_{32} & m_{12}^{2} + m_{22}^{2} + m_{32}^{2}\end{bmatrix} $

$M^TM$ is a $2\times2$ matrix.

And in general, for an $n\times p$ matrix $M$,

$ M^TM = \begin{bmatrix} m_{11} & m_{21} & \cdots & \cdots & m_{n1} \\ \vdots & \vdots & \ddots & \ddots & \vdots \\ m_{1p} & m_{2p} & \cdots & \cdots & m_{np} \\ \end{bmatrix} \cdot \begin{bmatrix} m_{11} & \cdots & m_{1p} \\ m_{21} & \cdots & m_{2p} \\ \vdots & \ddots & \vdots \\ \vdots & \ddots & \vdots \\ m_{n1} & \cdots & m_{np} \\ \end{bmatrix} = \begin{bmatrix} \sum_{i=1}^n m_{i1}^2 & \sum_{i=1}^n m_{i1}m_{i2} & \cdots & \sum_{i=1}^n m_{i1}m_{ip} \\ \sum_{i=1}^n m_{i2}m_{i1} & \sum_{i=1}^n m_{i2}^2 & \cdots & \sum_{i=1}^n m_{i2}m_{ip} \\ \vdots & \vdots & \ddots & \vdots \\ \sum_{i=1}^n m_{ip}m_{i1} & \sum_{i=1}^n m_{ip}m_{i2} & \cdots & \sum_{i=1}^n m_{ip}^2 \\ \end{bmatrix} $

yielding a square $p\times p$ matrix.

Instead of $ \hspace{1cm} M\vec{x}\approx\vec{y} \hspace{1cm} $ for which $M^{-1}$ does not exist because $M$ is rectangular $n\times p$,

consider $ \hspace{1cm} \left(M^TM\right)\vec{x} \approx M^T\vec{y} \hspace{1cm} $

Does the square $p\times p$ matrix $\left(M^TM\right)$ have an inverse $\left(M^TM\right)^{-1}$?

If the inverse $\left(M^TM\right)^{-1}$ exists, then we can solve for $\vec{x}$ by backsolving the following system:

$ \left(M^TM\right)\vec{x} = M^T\vec{y} \hspace{1cm}\Rightarrow\hspace{1cm} \vec{x} = \left(M^TM\right)^{-1}M^T\vec{y} $

The same criteria for successful backsolving apply here too.

In particular, if the columns of $\left(M^TM\right)$ are linearly independent, then $\left(M^TM\right)^{-1}$ exists.

We can determine whether $\left(M^TM\right)$ has linearly independent columns by testing the columns of $M$.

Here is an example where the columns are not linearly independent.

Let $ \hspace{1cm} M = \begin{bmatrix}m_{11} & 2 m_{11}\\m_{21} & 2 m_{21}\\m_{31} & 2 m_{31}\end{bmatrix}, \hspace{1cm} $ then

$ M^TM = \begin{bmatrix}m_{11}^{2} + m_{21}^{2} + m_{31}^{2} & 2 m_{11}^{2} + 2 m_{21}^{2} + 2 m_{31}^{2}\\2 m_{11}^{2} + 2 m_{21}^{2} + 2 m_{31}^{2} & 4 m_{11}^{2} + 4 m_{21}^{2} + 4 m_{31}^{2}\end{bmatrix} $

The second column of $M^TM$ is $2\times$ the first column and $M^TM$ is not invertible.

Therefore, in this example, we cannot solve $\, \left(M^TM\right)\vec{x} \approx M^T\vec{y}\,$ by inverting $\left(M^TM\right)$.
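We can run the same check numerically (a small sketch with made-up numbers following the duplicated-column pattern above): the rank of $M$ is less than its number of columns, so $M^TM$ cannot be inverted.

# second column is 2x the first, so the columns are linearly dependent
M = np.array([[1.,  2.],
              [3.,  6.],
              [5., 10.]])
print(np.linalg.matrix_rank(M))   # 1, which is less than the 2 columns
try:
    np.linalg.inv(M.T @ M)
except np.linalg.LinAlgError as err:
    print("LinAlgError:", err)    # singular matrix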

In [5]:
# programmatically, you can take the transpose of a matrix in Numpy or SymPy using M.T
M = np.reshape(np.arange(6)+1,(3,2))
print(M)
print("")
# transpose M
print(M.T)
[[1 2]
 [3 4]
 [5 6]]

[[1 3 5]
 [2 4 6]]
In [6]:
a,b,c,d,e,f = sympy.symbols("m_{11} m_{12} m_{21} m_{22} m_{31} m_{32}")
M = sympy.Matrix([
    [a,b],
    [c,d],
    [e,f]
])
M
Out[6]:
$\displaystyle \left[\begin{matrix}m_{11} & m_{12}\\m_{21} & m_{22}\\m_{31} & m_{32}\end{matrix}\right]$
In [7]:
# transpose M
M.T
Out[7]:
$\displaystyle \left[\begin{matrix}m_{11} & m_{21} & m_{31}\\m_{12} & m_{22} & m_{32}\end{matrix}\right]$
In [8]:
M.T@M
Out[8]:
$\displaystyle \left[\begin{matrix}m_{11}^{2} + m_{21}^{2} + m_{31}^{2} & m_{11} m_{12} + m_{21} m_{22} + m_{31} m_{32}\\m_{11} m_{12} + m_{21} m_{22} + m_{31} m_{32} & m_{12}^{2} + m_{22}^{2} + m_{32}^{2}\end{matrix}\right]$
In [9]:
# Python NumPy for solving the system with the matrix transpose

# generate example prices \vec{x}
x = np.random.randint(0,20,size=5)
p = len(x)
# should be n > p
n = p+20
# generate example purchasing behavior for n individuals
M = np.random.randint(0,50,size=(n,p))
# calculate true total costs \vec{y}
y = M@x
In [11]:
# attempt to backsolve, but expect error because M is not square
x = np.linalg.solve(M,y)
---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
/var/folders/1_/mlqp4_wx2_54k9cswn2_f0qr0000gn/T/ipykernel_92453/588746993.py in <module>
      1 # attempt to backsolve, but expect error because M is not square
----> 2 x = np.linalg.solve(M,y)

/opt/miniconda3/lib/python3.9/site-packages/numpy/linalg/linalg.py in solve(a, b)
    394     a, _ = _makearray(a)
    395     _assert_stacked_2d(a)
--> 396     _assert_stacked_square(a)
    397     b, wrap = _makearray(b)
    398     t, result_t = _commonType(a, b)

/opt/miniconda3/lib/python3.9/site-packages/numpy/linalg/linalg.py in _assert_stacked_square(*arrays)
    211         m, n = a.shape[-2:]
    212         if m != n:
--> 213             raise LinAlgError('Last 2 dimensions of the array must be square')
    214 
    215 def _assert_finite(*arrays):

LinAlgError: Last 2 dimensions of the array must be square
In [12]:
# instead, using M.T
y2 = M.T@y
# estimate prices 
xhat = np.linalg.solve(M.T@M,y2)
# simulate total costs with estimated prices
yhat = M@xhat
In [13]:
# plot comparison of actual and estimated prices
plt.plot(x,xhat,'o',zorder=4,label="_data")
xx,yy = plt.xlim(),plt.ylim()
plt.plot(xx,xx,zorder=3,label="$x_i=\\hat{x}_i$",linewidth=3)
plt.legend(fontsize=15)
plt.xlim(xx)
plt.ylim(yy)
plt.xlabel("Actual Price ($x_i$)",fontsize=15)
plt.ylabel("Estimated Price ($\\hat{x}_i$)",fontsize=15);
[Plot: Actual Price ($x_i$) vs. Estimated Price ($\hat{x}_i$)]
In [14]:
# plot comparison of actual and estimated total costs
plt.plot(y,yhat,'o',zorder=4,label="_data")
xx,yy = plt.xlim(),plt.ylim()
plt.plot(xx,xx,zorder=3,label="$y_i=\\hat{y}_i$",linewidth=3)
plt.legend(fontsize=15)
plt.xlim(xx)
plt.ylim(yy)
plt.xlabel("Actual Total Cost ($y_i$)",fontsize=15)
plt.ylabel("Estimated Total Cost ($\\hat{y}_i$)",fontsize=15);
[Plot: Actual Total Cost ($y_i$) vs. Estimated Total Cost ($\hat{y}_i$)]
In [15]:
# Another option is to use the sklearn Python package
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

from sklearn.linear_model import LinearRegression

# Create a Linear Regression object
ols = LinearRegression(fit_intercept=False)
# fit the linear regression to our data
ols.fit(M,y)

# get estimated prices (the fitted coefficients)
xhat = np.round(ols.coef_,decimals=10)
# get estimated total costs
yhat = ols.predict(M)
In [16]:
# plot comparison of actual and estimated prices
plt.plot(x,xhat,'o',zorder=4,label="_data")
xx,yy = plt.xlim(),plt.ylim()
plt.plot(xx,xx,zorder=3,label="$x_i=\\hat{x}_i$",linewidth=3)
plt.legend(fontsize=15)
plt.xlim(xx)
plt.ylim(yy)
plt.xlabel("Actual Price ($x_i$)",fontsize=15)
plt.ylabel("Estimated Price ($\\hat{x}_i$)",fontsize=15);
[Plot: Actual Price ($x_i$) vs. Estimated Price ($\hat{x}_i$)]
In [17]:
# plot comparison of actual and estimated total costs
plt.plot(y,yhat,'o',zorder=4,label="_data")
xx,yy = plt.xlim(),plt.ylim()
plt.plot(xx,xx,zorder=3,label="$y_i=\\hat{y}_i$",linewidth=3)
plt.legend(fontsize=15)
plt.xlim(xx)
plt.ylim(yy)
plt.xlabel("Actual Total Cost ($y_i$)",fontsize=15)
plt.ylabel("Estimated Total Cost ($\\hat{y}_i$)",fontsize=15);
[Plot: Actual Total Cost ($y_i$) vs. Estimated Total Cost ($\hat{y}_i$)]

Solving Linear Systems Probabilistically with OLS

As before, consider an overdetermined system with more equations than unknown variables

$ M = \begin{bmatrix} m_{11} & m_{12} & \cdots & m_{1p} \\ m_{21} & m_{22} & \cdots & m_{2p} \\ \vdots & \vdots & \ddots & \vdots \\ m_{n1} & m_{n2} & \cdots & m_{np} \\ \end{bmatrix}, \hspace{1cm} \vec{x} = \begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_p \end{bmatrix}, \hspace{1cm} \color{blue}{ \vec{y} = \begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{bmatrix}}, $

such that $\hspace{1cm}M\cdot\vec{x}\approx\color{blue}{\vec{y}}\hspace{1cm}$ and $n>p$.

As before, we can think about $\vec{x}$ as prices for $p$ different grocery items, $M$ as the purchases of $n$ individuals, and $\color{blue}{\vec{y}}$ as the total costs for each individual.

Unlike before, $M$ is a data set of purchasing behaviors across individuals which includes real-world randomness and variation.

We will find a probabilistic solution for $\vec{x}$.

Unlike before, we expect that $x_1,\,x_2,\,\cdots,\,x_p$ will represent some average prices.

Specifically, we want to estimate prices $\hat{x}$ such that our estimated total costs $M\cdot\hat{x}=\color{red}{\hat{y}}$ are as close to the true $\color{blue}{\vec{y}}$ as possible.

That is, we want to minimize the norm of the difference vector: $\hspace{1cm}||\color{blue}{\vec{y}}-\color{red}{\hat{y}}||=\sqrt{\sum_{i=1}^n \left(\color{blue}{y_i}-\color{red}{\hat{y}_i}\right)^2}$

The overarching strategy is called ordinary least squares (OLS) regression.

Instead of $\hspace{1cm}M\vec{x}\approx\color{blue}{\vec{y}}$

we multiply both sides of the equation by $M^T$ and solve the following for $\color{red}{\hat{x}}$

$ \left(M^TM\right)\vec{x} \approx M^T\color{blue}{\vec{y}} \hspace{1cm}\Rightarrow\hspace{1cm} \color{red}{\hat{x}} \approx \left(M^TM\right)^{-1}\cdot M^T\vec{y} $

The solution we get will be an estimate $\color{red}{\hat{x}}$ for the true vector $\vec{x}$. The components $\color{red}{\hat{x}_i}$ of $\color{red}{\hat{x}}$ are called coefficients.

We then see how well our estimate performs by calculating an estimate of $\color{blue}{\vec{y}}$ using $\color{red}{\hat{x}}$ according to $ \hspace{1cm} M\color{red}{\hat{x}}=\color{red}{\hat{y}} $

And we measure our performance using $\,||\color{blue}{\vec{y}}-\color{red}{\hat{y}}||$

Note that the end result may not perfectly reproduce $\color{blue}{\vec{y}}$ and our estimate $\color{red}{\hat{x}}$ is only an approximation of the true $\vec{x}$.
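As a sketch (with freshly generated random data, analogous to the earlier cells), NumPy's `np.linalg.lstsq` computes this least-squares estimate directly and agrees with solving the normal equations $\left(M^TM\right)\hat{x} = M^T\vec{y}$:

rng = np.random.default_rng(0)
n, p = 25, 5
M = rng.integers(0, 50, size=(n, p)).astype(float)   # purchasing behavior
x_true = rng.integers(0, 20, size=p).astype(float)    # true prices
y = M @ x_true + rng.normal(scale=5.0, size=n)        # noisy total costs

xhat_normal = np.linalg.solve(M.T @ M, M.T @ y)       # normal equations
xhat_lstsq  = np.linalg.lstsq(M, y, rcond=None)[0]    # direct least squares
print(np.allclose(xhat_normal, xhat_lstsq))           # True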

How good is our estimate $\hat{x}$?

There are a few options for assessing this question.

As mentioned, we can compare $\,M\hat{x}=\hat{y}\,$ to the true $\vec{y}$ using $ \hspace{1cm} ||\vec{y}-\hat{y}|| = \sqrt{\sum_{i=1}^n \left(y_i-\hat{y}_i\right)^2} $

Another common option is to calculate the coefficient of determination $\hspace{1cm} R^2 = \frac{\sum_{i=1}^n \left(\hat{y}_i-\bar{y}\right)^2}{\sum_{i=1}^n \left(y_i-\bar{y}\right)^2}\hspace{1cm}$ where $\bar{y}$ is the average value of $\vec{y}$.

You can treat $R^2$ as the fraction of the variance in $\vec{y}$ that is "explained" by $\hat{y}$.

$R^2$ ranges from 0 to 1 where $R^2=1$ indicates that all of the variation in $\vec{y}$ is recovered in $\hat{y}$.

We will discuss this measure more later, but $R^2=1$ means that there exist $m,\,b$ such that $\hspace{1cm}\vec{y}=m\cdot\hat{y}+b$
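Here is a minimal sketch of that calculation using the formula above (the small example vectors are made up for illustration):

def r_squared(y, yhat):
    # fraction of the variance in y captured by yhat
    ybar = np.mean(y)
    return np.sum((yhat - ybar)**2) / np.sum((y - ybar)**2)

y_demo    = np.array([7., 11., 15., 19., 23.])
yhat_demo = np.array([7.5, 10.5, 15., 19.5, 22.5])
print(r_squared(y_demo, yhat_demo))   # about 0.96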

What do these calculations say about the ability of our approach to explain the purchases of each of the $n$ individuals?

Each term in $ \hspace{1cm} \sum_{i=1}^n \left(y_i-\hat{y}_i\right)^2 \hspace{1cm} $ corresponds to one of the $n$ individuals (i.e., each row of $M$).

The difference between the actual total cost $\,y_i\,$ and the estimated total cost $\,\hat{y}_i\,$ is called the $i$th residual.

If we plot the total cost data $\left(1,y_1\right),\,\left(2,y_2\right),\,\dots,\,\left(n,y_n\right)\,$ compared to the estimated total costs $\left(1,\hat{y}_1\right),\,\left(2,\hat{y}_2\right),\,\dots,\,\left(n,\hat{y}_n\right)\,$ then the $i$th residual is the vertical distance between points.

In this example, we consider 9 grocery items and the purchasing information of 450 individuals who potentially shopped at different stores for the same grocery items. Therefore, the price of a specific grocery item may vary across individuals.

Matrix $M$ is $450\times9$ and represents the individuals' purchasing data.

Use the slider to change the variability in the prices for the same grocery items across individuals, but $M$ remains unchanged.

We attempt to estimate the prices $\hat{x}$ and use these estimates to recalculate the individuals' total costs $\hat{y}$, which we compare to their actual total costs $\vec{y}$.

In [18]:
from scipy.stats import pearsonr
# generate true data
x = np.array([1,4,20,5,3,6,3,7,2])
# number of grocery item prices (one price per item)
p = len(x)
# number of individuals with purchasing data (here just want n > p)
n = 50*p
x = np.tile(x,(n,1))
M = np.random.randint(0,15,(n,p))
# precalculate stuff for widget
MT = M.T
G = MT@M
y = np.sum(M*x,axis=1)
# generate noise to be added to the prices (x).
# Here, the noise represents variation in prices for the same grocery item across different stores.
# Prices are normally distributed about some average price.
noise = np.random.normal(size=x.shape)

def plotResiduals(priceVariation=0):
    noisyx = x+priceVariation*noise
    ynoisy = np.sum(M*noisyx,axis=1)
    y2 = MT@ynoisy
    xhat = np.linalg.solve(G,y2)
    yhat = M@xhat
    
    # plot outcomes
#     plt.figure(figsize=(12,6));
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2,figsize=(14,8));
    n2 = np.min([10,n])
    for i in range(n2):
        ax1.plot([i+1,i+1],[y[i],yhat[i]],'k-',label="_residual")
    
    a = np.arange(n2)+1
    ax1.plot(a,y[:n2],'o',label="Actual Cost ($y_i$)",markersize=8)
    ax1.plot(a,yhat[:n2],'^',label="Estimated Cost ($\\hat{y}_i$)",markersize=8)
    ax1.set_ylim((-500,1000))
    xx,yy = ax1.get_xlim(),ax1.get_ylim()
    ax1.plot([0,0],[1,2],"k-",label="Residual",linewidth=3)
    ax1.set_xlim(xx)
    ax1.set_ylim(yy)
    ax1.legend(fontsize=15,ncol=2,loc="lower left")
    ax1.set_xticks(list(map(int,a)))
    ax1.set_xlabel("Individual ($i$)",fontsize=15)
    ax1.set_yticklabels(ax1.get_yticks(),fontsize=15)
    ax1.set_ylabel("Total Cost ($)",fontsize=15)

    
    ax2.plot(y,yhat,'o',zorder=4,label="_data")
    r,p = pearsonr(y,yhat)
    xx,yy = ax2.get_xlim(),ax2.get_ylim()
    ax2.plot(xx,xx,'-',zorder=3,label="$\\hat{y}_i = y_i$",linewidth=3)
    ax2.set_title("$R^2=%0.2f$" % (r**2),fontsize=15)
    ax2.set_xlim(xx)
    ax2.set_ylim(yy)
    ax2.set_xlabel("Actual Total Cost ($y_i$)",fontsize=15)
    ax2.set_ylabel("Estimated Total Cost ($\\hat{y}_i$)",fontsize=15)
    ax2.legend(fontsize=12)
    
    C = ['r','b','g']
    for i in range(3):
        counts,bins = np.histogram(noisyx[:,i],bins=20)
        ax3.plot(bins[:-1],counts,'-o',label="grocery %d" % (i+1),color=C[i])
    yy = ax3.get_ylim()[1]
    for i in range(3):
        ax3.plot([xhat[i],xhat[i]],[0,yy],'--',color=C[i],label="_xhat",linewidth=3)
    ax3.plot([0,0],[-1,-2],'k--',label="Estimated Price ($\\hat{x}_i$)")
    ax3.set_ylim((0,yy))
    ax3.legend(fontsize=12)
    ax3.set_xlabel("Observed Grocery Item Price ($)",fontsize=15)
    ax3.set_ylabel("Number of Observations",fontsize=15)
    ax3.set_yticklabels(ax3.get_yticks(),fontsize=15)
    
    ax4.plot(x[0,:],xhat,'o',zorder=4,label="_data")
    xx,yy = ax4.get_xlim(),ax4.get_ylim()
    ax4.plot(xx,xx,'-',zorder=3,label="$\\hat{x}_i = x_i$",linewidth=3)
    ax4.set_xlim(xx)
    ax4.set_ylim(yy)
    ax4.set_xlabel("Actual Grocery Price ($x_i$)",fontsize=15)
    ax4.set_ylabel("Estimated Grocery Price ($\\hat{x}_i$)",fontsize=15)
    ax4.legend(fontsize=12)
    
    
    plt.tight_layout();
    return None

widgets.interact(plotResiduals,priceVariation=(0,50,1));

Fitting Linear Equations to Data

Our approach to solving overdetermined linear systems can be used to describe data.

Imagine the following data

$\begin{array}{c|c} \hline x & y \\ \hline 1 & 7\\2 & 11\\3 & 15\\4 & 19\\5 & 23\end{array}$

If we assume that $x$ and $y$ are linearly related, then there exist coefficients $\beta_0$ and $\beta_1$ such that $ \quad y = \beta_0 + \beta_1\cdot x $

Can we determine $\beta_0,\, \beta_1$?

In [19]:
x = np.arange(5)+1
y = 3+4*x
d = sympy.Matrix(np.vstack((x,y)).T)
print(sympy.latex(d))
\left[\begin{matrix}1 & 7\\2 & 11\\3 & 15\\4 & 19\\5 & 23\end{matrix}\right]
In [20]:
plt.plot(x,y,'o')
plt.xlabel("x",fontsize=15)
plt.ylabel("y",fontsize=15);
plt.title("What does the data look like?",fontsize=15);
[Plot: scatter of $y$ vs. $x$ — "What does the data look like?"]

Starting with $ \quad y = \beta_0 + \beta_1\cdot x \quad $ instead consider $ \quad y = \beta_0\cdot \color{red}{1} + \beta_1\cdot x $

This is a small change and, of course, the two equations are equivalent.

But, written this way we see that our problem can also be written as

$ M\vec{\beta} = \begin{bmatrix} x_1 & 1 \\ x_2 & 1 \\ \vdots & \vdots \\ x_n & 1 \\ \end{bmatrix} \cdot \begin{bmatrix} \beta_1 \\ \beta_0 \\ \end{bmatrix} = \vec{y} $

This is an overdetermined system of linear equations. We know how to solve this!

As before, we use $M^T$ and solve

$ \begin{bmatrix} \beta_1 \\ \beta_0 \\ \end{bmatrix} = \left( \begin{bmatrix} x_1 & x_2 & \cdots & x_n \\ 1 & 1 & \cdots & 1 \\ \end{bmatrix} \begin{bmatrix} x_1 & 1 \\ x_2 & 1 \\ \vdots & \vdots \\ x_n & 1 \\ \end{bmatrix} \right)^{-1} \begin{bmatrix} x_1 & x_2 & \cdots & x_n \\ 1 & 1 & \cdots & 1 \\ \end{bmatrix} \vec{y} $

In [21]:
# our data
x = np.arange(5)+1
y = np.array([7,11,15,19,23])

# create the matrix containing the data
M = np.vstack((x,np.ones(len(x)))).T
M
Out[21]:
array([[1., 1.],
       [2., 1.],
       [3., 1.],
       [4., 1.],
       [5., 1.]])
In [22]:
# use the transpose of M to solve the system of linear equations
G = M.T@M
yy = M.T@y
b = np.linalg.solve(G,yy)
b
Out[22]:
array([4., 3.])

Our solution is $\beta_0=3$ and $\beta_1=4$ so that $\quad y = 3 + 4x$

In [23]:
# Let's test our solution
yhat = M@b
print(yhat)
print(y)
[ 7. 11. 15. 19. 23.]
[ 7 11 15 19 23]

In this first case, $y$ and $x$ were perfectly linearly related. The linear equation yielded an exact solution. The scatter plot relating $x$ and $y$ hinted at this visually.

But what if $x$ and $y$ are imperfectly linearly related?

Consider the following scatter plot for a new data set of $x$ and $y$.

In [24]:
x2 = np.arange(10)+1
n2 = len(x2)
e2 = 5
y2 = 3 + 4*x2 +e2*np.random.normal(size=n2)
In [25]:
plt.plot(x2,y2,'o')
plt.xlabel("x",fontsize=15)
plt.ylabel("y",fontsize=15);
plt.title("Does the data look like a line?",fontsize=15);
[Plot: scatter of $y$ vs. $x$ — "Does the data look like a line?"]

There is no straight line that intersects all of the points, but, still, perhaps there is a straight line that captures the general relationship between $x$ and $y$.

This line is called the line of best fit.

The line of best fit is the "best" line in that it minimizes the sum of the squared residuals between the data points and the line.
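One convenient shortcut (a sketch; the widget below lets you search for the line by hand first) is `np.polyfit`, which returns the least-squares coefficients for a polynomial of a given degree:

# degree-1 fit of the noisy data; returns [slope, intercept] = [beta_1, beta_0]
beta1, beta0 = np.polyfit(x2, y2, deg=1)
print(beta0, beta1)   # should land near 3 and 4 for this data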

In [26]:
def lineWidget(b0=1,b1=1):
    x,y = x2,y2
    plt.plot(x,y,'o',label="Data",zorder=4)
    plt.xlabel("x",fontsize=15)
    plt.ylabel("y",fontsize=15);
    xx = np.array(plt.xlim())
    yy = plt.ylim()
    z = b0 + b1*x
#     R = pearsonr(y,z)[0]**2
    R = np.sum(np.abs(y-z))
    plt.title("Try to minimize $|y-\\hat{y}|=%0.1f$" % R,fontsize=15)
    for i in range(len(x)):
        plt.plot([x[i],x[i]],[y[i],z[i]],'k-',zorder=3,label="_residual")
    zz = b0 + b1*xx
    plt.plot(xx,zz,'r-',label="$\\beta_0+\\beta_1 x$",zorder=5)
    plt.plot([-10,-5],[0,0],'k-',label="Residual")
    plt.xlim(xx)
    plt.ylim(yy)
    plt.legend(loc="lower right",fontsize=12)
    plt.xlabel("x",fontsize=15)
    plt.ylabel("y",fontsize=15)
    
    
widgets.interact(lineWidget,b0=(0,5,.2),b1=(0,5,.2));

We approach this problem the exact same way as before and solve

$ \begin{bmatrix} \beta_1 \\ \beta_0 \\ \end{bmatrix} = \left( \begin{bmatrix} x_1 & x_2 & \cdots & x_n \\ 1 & 1 & \cdots & 1 \\ \end{bmatrix} \begin{bmatrix} x_1 & 1 \\ x_2 & 1 \\ \vdots & \vdots \\ x_n & 1 \\ \end{bmatrix} \right)^{-1} \begin{bmatrix} x_1 & x_2 & \cdots & x_n \\ 1 & 1 & \cdots & 1 \\ \end{bmatrix} \vec{y} $

but, in this case, our solution for $\hat{y} = \beta_0 + \beta_1x$ will only approximate $\vec{y}$.

However, this approximation will be the "best" in that it minimizes the residuals.

For this data, the best-fit coefficients turn out to be close to the values used to generate it: $\quad \beta_0\approx3,\,\beta_1\approx4$.
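As a quick check (a sketch reusing the noisy x2 and y2 generated above), solving the normal equations for this data gives coefficients near, but not exactly equal to, those values:

# fit the noisy data with the same transpose approach as before
M2 = np.vstack((x2, np.ones(len(x2)))).T     # columns: [x, 1]
b2 = np.linalg.solve(M2.T @ M2, M2.T @ y2)   # [beta_1, beta_0]
print(b2)                                    # roughly [4., 3.]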

In [27]:
# lineWidget is defined in the cell above; re-display the interactive widget
widgets.interact(lineWidget,b0=(0,5,.2),b1=(0,5,.2));

Given a data set containing $x$ and $y$, a similar approach can find best fit approximations assuming other functional forms that minimize the residuals between the data and the approximation.

Instead of fitting a straight line to $x$ and $y$, we can assume that the variables are related by a polynomial of degree $d$:

$ \quad y = \displaystyle\sum_{i=0}^{d} \beta_i\cdot x^i $

For example, let's consider some new data and assume the following relationship $ \quad y = \beta_2\cdot x^2 + \beta_1\cdot x + \beta_0 \quad $

In [28]:
x3 = np.arange(15)
y3 = 2*x3**2 + 3*x3 + 1
In [29]:
def quadWidget(b0=1,b1=1,b2=1):
    x2,y2 = x3,y3
    plt.plot(x2,y2,'o',label="Data",zorder=4)
    plt.xlabel("x",fontsize=15)
    plt.ylabel("y",fontsize=15);
    xx = np.array(plt.xlim())
    yy = plt.ylim()
    z = b0 + b1*x2 + b2*x2**2
#     R = pearsonr(y,z)[0]**2
    R = np.sum(np.abs(y2-z))
    plt.title("Try to minimize $|y-\\hat{y}|=%0.1f$" % R,fontsize=15)
    for i in range(len(x2)):
        plt.plot([x2[i],x2[i]],[y2[i],z[i]],'k-',zorder=3,label="_residual")
    plt.plot(x2,z,'r-',label="$\\beta_0+\\beta_1 x + \\beta_2 x^2$",zorder=5)
    plt.plot([-10,-5],[0,0],'k-',label="Residual")
    plt.xlim(xx)
    plt.ylim(yy)
    plt.legend(loc="lower right",fontsize=12)
    plt.xlabel("x",fontsize=15)
    plt.ylabel("y",fontsize=15)
    
    
widgets.interact(quadWidget,b0=(0,5,.2),b1=(0,5,.2),b2=(0,5,.2));

We change our previous approach by adding a column to $M$ for each order of the polynomial and we add a corresponding coefficient to $\vec{\beta}$.

$ M\vec{\beta} = \begin{bmatrix} x_1^2 & x_1 & 1 \\ x_2^2 & x_2 & 1 \\ \vdots & \vdots & \vdots \\ x_n^2 & x_n & 1 \\ \end{bmatrix} \cdot \begin{bmatrix} \beta_2 \\ \beta_1 \\ \beta_0 \\ \end{bmatrix} = \vec{y} $

and, just as before, we use $M^T$ to solve the overdetermined system using $\left(M^TM\right)^{-1}$:

$ \begin{bmatrix} \beta_2 \\ \beta_1 \\ \beta_0 \\ \end{bmatrix} = \left( \begin{bmatrix} x_1^2 & x_2^2 & \cdots & x_n^2 \\ x_1 & x_2 & \cdots & x_n \\ 1 & 1 & \cdots & 1 \\ \end{bmatrix} \begin{bmatrix} x_1^2 & x_1 & 1 \\ x_2^2 & x_2 & 1 \\ \vdots & \vdots & \vdots \\ x_n^2 & x_n & 1 \\ \end{bmatrix} \right)^{-1} \begin{bmatrix} x_1^2 & x_2^2 & \cdots & x_n^2 \\ x_1 & x_2 & \cdots & x_n \\ 1 & 1 & \cdots & 1 \\ \end{bmatrix} \vec{y} $

In this case, we get $\vec{\beta} = \begin{bmatrix} 2 \\ 3 \\ 1 \\ \end{bmatrix}$.
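The widget below lets you search for these coefficients by hand, but we can also compute them directly (a short sketch using the x3 and y3 defined above):

# build the quadratic design matrix [x^2, x, 1] and solve with the transpose approach
M3 = np.vstack((x3**2, x3, np.ones(len(x3)))).T
b3 = np.linalg.solve(M3.T @ M3, M3.T @ y3)   # [beta_2, beta_1, beta_0]
print(b3)   # [2., 3., 1.] because y3 is an exact quadratic in x3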

In [30]:
widgets.interact(quadWidget,b0=(0,5,.2),b1=(0,5,.2),b2=(0,5,.2));

Regression Example: Home sales and amenities

Data from the Kaggle Machine Learning Challenge (https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview).

Here we will analyze a real world data set of home features and the home's sales price ($\vec{y}$).

Each of $n$ homes and their $p$ features will be represented by an $n\times p$ matrix $M$. This matrix is our data set.

The goal is to predict the sales price of each house from its features which requires that we estimate the dollar value of each home feature ($\vec{x}$).

In [31]:
# import the Pandas Python package for working with dataframes
import pandas as pd

# read in housing price data as a dataframe
data = pd.read_csv("./kaggleHousingPrices/train.csv")
data.head()
Out[31]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

In [32]:
data.columns
Out[32]:
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')
In [33]:
# summary statistics of each column
data.describe()
Out[33]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
count 1460.000000 1460.000000 1201.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1452.000000 1460.000000 ... 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000
mean 730.500000 56.897260 70.049958 10516.828082 6.099315 5.575342 1971.267808 1984.865753 103.685262 443.639726 ... 94.244521 46.660274 21.954110 3.409589 15.060959 2.758904 43.489041 6.321918 2007.815753 180921.195890
std 421.610009 42.300571 24.284752 9981.264932 1.382997 1.112799 30.202904 20.645407 181.066207 456.098091 ... 125.338794 66.256028 61.119149 29.317331 55.757415 40.177307 496.123024 2.703626 1.328095 79442.502883
min 1.000000 20.000000 21.000000 1300.000000 1.000000 1.000000 1872.000000 1950.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 2006.000000 34900.000000
25% 365.750000 20.000000 59.000000 7553.500000 5.000000 5.000000 1954.000000 1967.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.000000 2007.000000 129975.000000
50% 730.500000 50.000000 69.000000 9478.500000 6.000000 5.000000 1973.000000 1994.000000 0.000000 383.500000 ... 0.000000 25.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.000000 2008.000000 163000.000000
75% 1095.250000 70.000000 80.000000 11601.500000 7.000000 6.000000 2000.000000 2004.000000 166.000000 712.250000 ... 168.000000 68.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.000000 2009.000000 214000.000000
max 1460.000000 190.000000 313.000000 215245.000000 10.000000 9.000000 2010.000000 2010.000000 1600.000000 5644.000000 ... 857.000000 547.000000 552.000000 508.000000 480.000000 738.000000 15500.000000 12.000000 2010.000000 755000.000000

8 rows × 38 columns

In [34]:
# Access specific column
# Each column is like a Numpy array 
# and can be thought of as a vector.
data["LotArea"]
Out[34]:
0        8450
1        9600
2       11250
3        9550
4       14260
        ...  
1455     7917
1456    13175
1457     9042
1458     9717
1459     9937
Name: LotArea, Length: 1460, dtype: int64
In [35]:
# visualize a column
data["LotArea"].hist()
Out[35]:
<AxesSubplot:>
[Plot: histogram of LotArea]
In [36]:
# dataframes are matrices underneath
M = data.values
print(M.shape)
M
(1460, 81)
Out[36]:
array([[1, 60, 'RL', ..., 'WD', 'Normal', 208500],
       [2, 20, 'RL', ..., 'WD', 'Normal', 181500],
       [3, 60, 'RL', ..., 'WD', 'Normal', 223500],
       ...,
       [1458, 70, 'RL', ..., 'WD', 'Normal', 266500],
       [1459, 20, 'RL', ..., 'WD', 'Normal', 142125],
       [1460, 20, 'RL', ..., 'WD', 'Normal', 147500]], dtype=object)

Fields of interest:

  • LotFrontage: Linear feet of street connected to property
  • LotArea: Lot size in square feet
  • OverallQual: Rates the overall material and finish of the house (1 to 10)
  • OverallCond: Rates the overall condition of the house (1 to 10)
  • GarageArea: Size of garage in square feet
  • PoolArea: Pool area in square feet
  • GrLivArea: Above grade (ground) living area square feet
  • BedroomAbvGr: Number of bedrooms above basement level
In [37]:
# Dataframe columns can contain any data type.
# Filter dataframe to numerical columns of interest.
data = pd.read_csv("./kaggleHousingPrices/train.csv")
data = data[["SalePrice","LotArea","GarageArea","PoolArea","GrLivArea","BedroomAbvGr"]]
data.head()
Out[37]:
SalePrice LotArea GarageArea PoolArea GrLivArea BedroomAbvGr
0 208500 8450 548 0 1710 3
1 181500 9600 460 0 1262 3
2 223500 11250 608 0 1786 3
3 140000 9550 642 0 1717 3
4 250000 14260 836 0 2198 4
In [38]:
# get matrix M and sale price vector y
y = data["SalePrice"]
M = data[["LotArea","GarageArea","PoolArea","BedroomAbvGr"]]
n,p = M.shape
# number of observations (home sales), number of home features 
print((n,p))
M.describe() 
(1460, 4)
Out[38]:
LotArea GarageArea PoolArea BedroomAbvGr
count 1460.000000 1460.000000 1460.000000 1460.000000
mean 10516.828082 472.980137 2.758904 2.866438
std 9981.264932 213.804841 40.177307 0.815778
min 1300.000000 0.000000 0.000000 0.000000
25% 7553.500000 334.500000 0.000000 2.000000
50% 9478.500000 480.000000 0.000000 3.000000
75% 11601.500000 576.000000 0.000000 3.000000
max 215245.000000 1418.000000 738.000000 8.000000
In [39]:
# attempt to solve for feature values, but expect error because M is not square
xhat = np.linalg.solve(M,y)
---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
/var/folders/1_/mlqp4_wx2_54k9cswn2_f0qr0000gn/T/ipykernel_92453/2762337710.py in <module>
      1 # attempt to solve for feature values, but expect error because M is not square
----> 2 xhat = np.linalg.solve(M,y)

/opt/miniconda3/lib/python3.9/site-packages/numpy/linalg/linalg.py in solve(a, b)
    394     a, _ = _makearray(a)
    395     _assert_stacked_2d(a)
--> 396     _assert_stacked_square(a)
    397     b, wrap = _makearray(b)
    398     t, result_t = _commonType(a, b)

/opt/miniconda3/lib/python3.9/site-packages/numpy/linalg/linalg.py in _assert_stacked_square(*arrays)
    211         m, n = a.shape[-2:]
    212         if m != n:
--> 213             raise LinAlgError('Last 2 dimensions of the array must be square')
    214 
    215 def _assert_finite(*arrays):

LinAlgError: Last 2 dimensions of the array must be square
In [40]:
# use M transpose approach
xhat = np.linalg.solve(M.T@M,M.T@y)
yhat = M@xhat
In [41]:
# did we do a good job?
plt.figure(figsize=(8,6))
plt.plot(y,yhat,'o',alpha=.5,label="_data",zorder=3)
xx,yy = plt.xlim(),plt.ylim()
plt.plot(xx,xx,'--',zorder=4,label="$\\hat{y}_i = y_i$",linewidth=3)
plt.xticks(fontsize=15,rotation=35)
plt.yticks(fontsize=15)
plt.xlabel("Actual Sales Price ($y_i$)",fontsize=15)
plt.ylabel("Estimated Sales Price ($\hat{y}_i$)",fontsize=15)

ybar = y.mean()
R2 = np.sum((yhat-ybar)**2) / np.sum((y-ybar)**2)
plt.title("$R^2=%0.2f$" % R2,fontsize=15);
plt.legend(fontsize=15);
[Plot: Actual Sales Price ($y_i$) vs. Estimated Sales Price ($\hat{y}_i$), with $R^2$ in the title]

Remember, we are essentially solving $\hspace{1cm}M\cdot\vec{x}=\vec{y}\hspace{1cm}$ which, in a real-world setting, is equivalent to

$ \begin{align} \$cost_1 &= \$price_{11}\cdot amount_{11}+ \$price_{12}\cdot amount_{12}+\dots+ \$price_{1p}\cdot amount_{1p} \\ \$cost_2 &= \$price_{21}\cdot amount_{21}+ \$price_{22}\cdot amount_{22}+\dots+ \$price_{2p}\cdot amount_{2p} \\ \vdots &= \hspace{3cm}\vdots \\ \$cost_n &= \$price_{n1}\cdot amount_{n1}+ \$price_{n2}\cdot amount_{n2}+\dots+ \$price_{np}\cdot amount_{np} \\ \end{align} $

Let $\hat{\$price}_j = \langle \$price_{ij}\rangle_i$ be the average price for item $j$, averaged over the individuals $i$.

Then our estimated price vector $\hspace{1cm}\hat{x}=\begin{bmatrix}\hat{\$price}_1 \\ \hat{\$price}_2 \\ \vdots \\ \hat{\$price}_p \\ \end{bmatrix}\hspace{1cm}$ has components with units in dollars.

That is, $\hat{\$price}_i$ is the estimated increase in the home sale price if $amenity_i$ is increased by 1 unit of measure (e.g., LotArea is increased by 1 square foot).
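For example (a sketch reusing xhat and M from the cells above), the estimated value of adding 100 square feet of garage space is roughly 100 times the GarageArea coefficient:

# estimated change in sale price for an extra 100 sq ft of garage space
garage_coef = dict(zip(M.columns, xhat))["GarageArea"]
print(100 * garage_coef)   # roughly $23,700 with this four-feature model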

In [42]:
# summarize the amenity cost estimates as a Pandas Series object
xhat1_series = pd.Series(
    data = xhat,
    index = M.columns,
).sort_values()
xhat1_series
Out[42]:
LotArea             1.190920
PoolArea           59.027950
GarageArea        236.762178
BedroomAbvGr    18931.625535
dtype: float64

Estimating the value added by each home amenity is very useful.

However, estimating these prices confidently is very tricky.

Let's see what happens if we add more home amenities.

In [45]:
data = pd.read_csv("./kaggleHousingPrices/train.csv")

# get matrix M and sale price vector y
y = data["SalePrice"]

# originally  M = data[["LotArea","GarageArea","PoolArea","BedroomAbvGr"]]
M = data[["LotArea","GarageArea","PoolArea","GrLivArea","BedroomAbvGr","OverallCond","OverallQual",]]
n,p = M.shape
# number of observations (home sales), number of home features 
print((n,p))

# use M transpose approach
xhat = np.linalg.solve(M.T@M,M.T@y)
yhat = M@xhat
(1460, 7)
In [46]:
# did we do a good job?
plt.figure(figsize=(8,6))
plt.plot(y,yhat,'o',alpha=.5,label="_data",zorder=3)
xx,yy = plt.xlim(),plt.ylim()
plt.plot(xx,xx,'--',zorder=4,label="$\\hat{y}_i = y_i$",linewidth=3)
plt.xlim(xx)
plt.ylim(yy)
plt.xticks(fontsize=15,rotation=35)
plt.yticks(fontsize=15)
plt.xlabel("Actual Sales Price ($y_i$)",fontsize=15)
plt.ylabel("Estimated Sales Price ($\hat{y}_i$)",fontsize=15)

ybar = y.mean()
R2 = np.sum((yhat-ybar)**2) / np.sum((y-ybar)**2)
plt.title("$R^2=%0.2f$" % R2,fontsize=15);
plt.legend(fontsize=15);
[Plot: Actual Sales Price ($y_i$) vs. Estimated Sales Price ($\hat{y}_i$), with $R^2$ in the title]
In [47]:
# summarize the amenity cost estimates as a Pandas Series object
pd.Series(
    data = xhat,
    index = M.columns,
).sort_values()
Out[47]:
BedroomAbvGr   -17117.597699
OverallCond     -5463.293108
PoolArea          -18.799606
LotArea             0.681247
GarageArea         59.607438
GrLivArea          64.353832
OverallQual     21165.914665
dtype: float64
In [48]:
# result from before
xhat1_series
Out[48]:
LotArea             1.190920
PoolArea           59.027950
GarageArea        236.762178
BedroomAbvGr    18931.625535
dtype: float64

The amenity value estimates are completely different!

We will discuss why this happens in a later module.

For now, it's just something to be aware of.