import pandas as pd #pandas is a data analysis library for python
# Load the raw weather data; the DATE column becomes the row index.
weather = pd.read_csv("weather.csv", index_col="DATE")
weather

# Fraction of missing values per column: null count over total row count.
null_pct = weather.isnull().sum() / len(weather)
null_pct

# Keep only the columns where fewer than 5% of the values are missing.
valid_columns = weather.columns[null_pct < .05]
valid_columns

# Subset to those columns; .copy() avoids a SettingWithCopyWarning later.
weather = weather[valid_columns].copy()

# Normalize column names to lowercase for easier access.
weather.columns = weather.columns.str.lower()
weather
# Forward-fill remaining gaps: yesterday's reading is a reasonable stand-in
# (e.g. if snow depth was 0 yesterday, it is most likely 0 today too).
weather = weather.ffill()

# Confirm that no missing values remain in any column.
weather.isnull().sum()

# Column dtypes; an "object" dtype usually means the column holds strings.
weather.dtypes

# The index is still stored as plain strings (object dtype) at this point.
weather.index

# Parse the index into datetimes so date components are easy to extract.
weather.index = pd.to_datetime(weather.index)
weather.index

# Year component of each row's date — not easily done with a string index.
weather.index.year

# Records per year, sorted chronologically;
# each year should have 365 rows (366 in leap years).
weather.index.year.value_counts().sort_index()

# Line plot of snow depth ("snwd") over time.
weather["snwd"].plot()
# The prediction target is the NEXT day's maximum temperature:
# shift(-1) pulls each row's value from the row below it,
# so Jan 1's target is Jan 2's tmax.
weather["target"] = weather["tmax"].shift(-1)
weather

# The final row has no "next day", so its target is NaN; forward-fill it.
# One slightly-off target among ~20,000 rows is insignificant.
weather = weather.ffill()
weather
# Ridge regression: like ordinary linear regression, but it penalizes
# (shrinks) the coefficients, which makes it robust to multicollinearity —
# the situation where several predictor columns are correlated with each other.
from sklearn.linear_model import Ridge

# alpha controls how strongly the coefficients are shrunk.
rr = Ridge(alpha=.1)

# Pairwise correlations between columns: precipitation is fairly
# uncorrelated with the rest, while the other columns correlate with each other.
weather.corr()

# Use every column except these three as predictors;
# ~ negates the mask, keeping the columns NOT in the exclusion list.
excluded = ["target", "name", "station"]
predictors = weather.columns[~weather.columns.isin(excluded)]
predictors
def backtest(weather, model, predictors, start=3650, step=90):
    """Walk-forward backtest of ``model`` on the weather data.

    Repeatedly trains on all rows before position ``i`` and predicts the
    next ``step`` rows, so every prediction uses only past data.

    Parameters
    ----------
    weather : pd.DataFrame
        Must contain the ``predictors`` columns and a "target" column.
    model : estimator
        Any object with sklearn-style ``fit(X, y)`` / ``predict(X)``.
    predictors : sequence of column labels
        Feature columns used to fit the model.
    start : int, default 3650
        Rows reserved for the first training set (~10 years of daily data).
    step : int, default 90
        Rows predicted per iteration (~one season).

    Returns
    -------
    pd.DataFrame
        Indexed like the test rows, with columns
        ["actual", "prediction", "diff"] where "diff" is the absolute error.
    """
    all_predictions = []
    for i in range(start, weather.shape[0], step):
        # Train on everything before i; test on the next `step` rows.
        train = weather.iloc[:i, :]
        test = weather.iloc[i:(i + step), :]
        model.fit(train[predictors], train["target"])
        preds = pd.Series(model.predict(test[predictors]), index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        # Absolute error per row — handy for finding the worst predictions.
        combined["diff"] = (combined["prediction"] - combined["actual"]).abs()
        all_predictions.append(combined)
    return pd.concat(all_predictions)
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Run the walk-forward backtest with the ridge model.
predictions = backtest(weather, rr, predictors)

# Mean absolute error: on average, how far off each prediction is.
mean_absolute_error(predictions["actual"], predictions["prediction"])

# Inspect the rows with the largest prediction errors first.
predictions.sort_values("diff", ascending=False)