import pandas as pd    #pandas is a data analysis library for python

weather = pd.read_csv("weather.csv", index_col="DATE")   
# pandas read csv function, reads in weather csv file. specifies that the first colomn (date colomn) is the index 
weather
STATION NAME ACMH ACSH AWND FMTM PGTM PRCP SNOW SNWD ... WT11 WT13 WT14 WT15 WT16 WT17 WT18 WT21 WT22 WV01
DATE
1970-01-01 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 80.0 90.0 NaN NaN NaN 0.00 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1970-01-02 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 30.0 20.0 NaN NaN NaN 0.00 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1970-01-03 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 80.0 100.0 NaN NaN NaN 0.02 0.0 0.0 ... NaN NaN NaN NaN 1.0 NaN 1.0 NaN NaN NaN
1970-01-04 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 10.0 20.0 NaN NaN NaN 0.00 0.0 0.0 ... NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN
1970-01-05 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 30.0 10.0 NaN NaN NaN 0.00 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2022-10-17 USW00094789 JFK INTERNATIONAL AIRPORT, NY US NaN NaN 9.62 NaN NaN 0.08 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2022-10-18 USW00094789 JFK INTERNATIONAL AIRPORT, NY US NaN NaN 12.08 NaN NaN 0.00 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2022-10-19 USW00094789 JFK INTERNATIONAL AIRPORT, NY US NaN NaN 14.99 NaN NaN 0.00 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2022-10-20 USW00094789 JFK INTERNATIONAL AIRPORT, NY US NaN NaN 16.78 NaN 10.0 0.00 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2022-10-21 USW00094789 JFK INTERNATIONAL AIRPORT, NY US NaN NaN NaN NaN NaN 0.00 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

19287 rows × 44 columns

null_pct = weather.apply(pd.isnull).sum()/weather.shape[0]
null_pct
#finds the number of null values in each colomn 
#then divides it by the total number of rows 
STATION    0.000000
NAME       0.000000
ACMH       0.501478
ACSH       0.501426
AWND       0.265256
FMTM       0.475087
PGTM       0.363872
PRCP       0.000000
SNOW       0.000000
SNWD       0.000104
TAVG       0.680406
TMAX       0.000000
TMIN       0.000000
TSUN       0.998393
WDF1       0.501685
WDF2       0.498678
WDF5       0.502981
WDFG       0.734484
WDFM       0.999948
WESD       0.685228
WSF1       0.501530
WSF2       0.498678
WSF5       0.503033
WSFG       0.613055
WSFM       0.999948
WT01       0.630217
WT02       0.935034
WT03       0.933271
WT04       0.982579
WT05       0.981127
WT06       0.990615
WT07       0.994400
WT08       0.796962
WT09       0.992741
WT11       0.999274
WT13       0.886711
WT14       0.954010
WT15       0.997822
WT16       0.658993
WT17       0.996889
WT18       0.939493
WT21       0.999741
WT22       0.997459
WV01       0.999948
dtype: float64
valid_columns = weather.columns[null_pct < .05]
valid_columns
#these are the colomns with less than 5% missing values 
Index(['STATION', 'NAME', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN'], dtype='object')
weather = weather[valid_columns].copy()
#preserves only the above colomns in our data
#.copy() prevents us from getting a copy warning later
weather.columns = weather.columns.str.lower()
weather
station name prcp snow snwd tmax tmin
DATE
1970-01-01 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 28 22
1970-01-02 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 31 22
1970-01-03 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.02 0.0 0.0 38 25
1970-01-04 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 31 23
1970-01-05 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 35 21
... ... ... ... ... ... ... ...
2022-10-17 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.08 0.0 0.0 67 54
2022-10-18 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 58 48
2022-10-19 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 56 43
2022-10-20 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 61 44
2022-10-21 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 64 43

19287 rows × 7 columns

weather = weather.ffill()
#for example if the last day had a snow depth of 0, then the next day would also probably have a snow depth of zero too
weather.apply(pd.isnull).sum()
#now we can see that all the missing values have been filled and we have 0 missing values
station    0
name       0
prcp       0
snow       0
snwd       0
tmax       0
tmin       0
dtype: int64
weather.dtypes
#everything is stored as the correct type here
#object data type usually indicates that the colomn is a string 
station     object
name        object
prcp       float64
snow       float64
snwd       float64
tmax         int64
tmin         int64
dtype: object
weather.index
#we can see that our index is stored as a object 
Index(['1970-01-01', '1970-01-02', '1970-01-03', '1970-01-04', '1970-01-05',
       '1970-01-06', '1970-01-07', '1970-01-08', '1970-01-09', '1970-01-10',
       ...
       '2022-10-12', '2022-10-13', '2022-10-14', '2022-10-15', '2022-10-16',
       '2022-10-17', '2022-10-18', '2022-10-19', '2022-10-20', '2022-10-21'],
      dtype='object', name='DATE', length=19287)
weather.index = pd.to_datetime(weather.index)
weather.index
#now we can see that our index is stored as a date time
DatetimeIndex(['1970-01-01', '1970-01-02', '1970-01-03', '1970-01-04',
               '1970-01-05', '1970-01-06', '1970-01-07', '1970-01-08',
               '1970-01-09', '1970-01-10',
               ...
               '2022-10-12', '2022-10-13', '2022-10-14', '2022-10-15',
               '2022-10-16', '2022-10-17', '2022-10-18', '2022-10-19',
               '2022-10-20', '2022-10-21'],
              dtype='datetime64[ns]', name='DATE', length=19287, freq=None)
weather.index.year
#this gives us the year componenet of our data, you can't easily do this with a string
Int64Index([1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970,
            ...
            2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022],
           dtype='int64', name='DATE', length=19287)
weather.index.year.value_counts().sort_index()
#it sorts the index in order by year
#counts how many times each unique value occurs, sees how many records we have for each year
#should be either 365 or 366 (for leap years)
1970    365
1971    365
1972    366
1973    365
1974    365
1975    365
1976    366
1977    365
1978    365
1979    365
1980    366
1981    365
1982    365
1983    365
1984    366
1985    365
1986    365
1987    365
1988    366
1989    365
1990    365
1991    365
1992    366
1993    365
1994    365
1995    365
1996    366
1997    365
1998    365
1999    365
2000    366
2001    365
2002    365
2003    365
2004    366
2005    365
2006    365
2007    365
2008    366
2009    365
2010    365
2011    365
2012    366
2013    365
2014    365
2015    365
2016    366
2017    365
2018    365
2019    365
2020    366
2021    365
2022    294
Name: DATE, dtype: int64
weather["snwd"].plot()
#creates car graph that shows snow depth by day
<AxesSubplot:xlabel='DATE'>
 
weather["target"] = weather.shift(-1)["tmax"]
#creating a target colomn in the weather data for predictions
#shift method keeps the same method but pulls value from the next row
weather
#for example Jan 1's target is Jan 2's Tmax
station name prcp snow snwd tmax tmin target
DATE
1970-01-01 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 28 22 31.0
1970-01-02 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 31 22 38.0
1970-01-03 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.02 0.0 0.0 38 25 31.0
1970-01-04 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 31 23 35.0
1970-01-05 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 35 21 36.0
... ... ... ... ... ... ... ... ...
2022-10-17 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.08 0.0 0.0 67 54 58.0
2022-10-18 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 58 48 56.0
2022-10-19 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 56 43 61.0
2022-10-20 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 61 44 64.0
2022-10-21 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 64 43 NaN

19287 rows × 8 columns

weather = weather.ffill()
#fill the vaule
weather
#one row having an incorrect target value will be insignificant with 20,000 rows in total
station name prcp snow snwd tmax tmin target
DATE
1970-01-01 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 28 22 31.0
1970-01-02 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 31 22 38.0
1970-01-03 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.02 0.0 0.0 38 25 31.0
1970-01-04 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 31 23 35.0
1970-01-05 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 35 21 36.0
... ... ... ... ... ... ... ... ...
2022-10-17 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.08 0.0 0.0 67 54 58.0
2022-10-18 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 58 48 56.0
2022-10-19 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 56 43 61.0
2022-10-20 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 61 44 64.0
2022-10-21 USW00094789 JFK INTERNATIONAL AIRPORT, NY US 0.00 0.0 0.0 64 43 64.0

19287 rows × 8 columns

from sklearn.linear_model import Ridge
#apply ridge regression model
#"Ridge regression is a model tuning method that is used to analyse any data that suffers from multicollinearity"
#"Multicollinearity is a statistical concept where several independent variables in a model are correlated"
#ridge regression is similar to linear regression except that it penalizes coefficients to account for multicollinearity

rr = Ridge(alpha=.1)
#alpha parameter controls how much coefficeints are shrunk
weather.corr()
#checking for correlation between colomns
#can see that preceptiation is pretty uncorrelated from the rest of colomns
#but the rest are correlated to each other
prcp snow snwd tmax tmin target
prcp 1.000000 0.151051 0.001178 -0.007210 0.052279 -0.003358
snow 0.151051 1.000000 0.232567 -0.175075 -0.159285 -0.172737
snwd 0.001178 0.232567 1.000000 -0.259858 -0.256977 -0.241054
tmax -0.007210 -0.175075 -0.259858 1.000000 0.955487 0.915296
tmin 0.052279 -0.159285 -0.256977 0.955487 1.000000 0.915461
target -0.003358 -0.172737 -0.241054 0.915296 0.915461 1.000000
predictors = weather.columns[~weather.columns.isin(["target", "name", "station"])]
#create a list of predictor colomns 
#gives all the colomns in weather data except for these 3
# ~ negative operator looks for colomns not in the list
predictors
Index(['prcp', 'snow', 'snwd', 'tmax', 'tmin'], dtype='object')
def backtest(weather, model, predictors, start=3650, step=90):
    all_predictions = []
    
    for i in range(start, weather.shape[0], step):
        train = weather.iloc[:i,:]
        test = weather.iloc[i:(i+step),:]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        combined["diff"] = (combined["prediction"] - combined["actual"]).abs()
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)
predictions = backtest(weather, rr, predictors)
from sklearn.metrics import mean_absolute_error, mean_squared_error

mean_absolute_error(predictions["actual"], predictions["prediction"])
5.13932667966084
predictions.sort_values("diff", ascending=False)
actual prediction diff
DATE
2007-03-26 78.0 49.744725 28.255275
1999-01-02 53.0 25.911898 27.088102
1998-03-26 80.0 53.033957 26.966043
1985-04-18 84.0 57.071179 26.928821
1990-03-12 85.0 58.144310 26.855690
... ... ... ...
2006-08-20 88.0 88.002937 0.002937
2019-03-01 40.0 40.002460 0.002460
1992-06-11 76.0 76.001641 0.001641
2011-09-16 66.0 66.000831 0.000831
2014-03-25 39.0 38.999920 0.000080

15637 rows × 3 columns