import pandas as pd #pandas is a data analysis library for python
# Load the raw weather data; the DATE column becomes the row index.
weather = pd.read_csv("weather.csv", index_col="DATE")
weather

# Fraction of missing values per column: null count over total row count.
null_pct = weather.isnull().sum() / len(weather)
null_pct

# Keep only the columns where fewer than 5% of the values are missing.
valid_columns = weather.columns[null_pct < .05]
valid_columns

# Subset to those columns; .copy() avoids a SettingWithCopyWarning later.
weather = weather[valid_columns].copy()

# Normalize column names to lowercase for easier access.
weather.columns = weather.columns.str.lower()
weather
# Forward-fill remaining gaps: yesterday's reading is a reasonable stand-in
# (e.g. if snow depth was 0 yesterday, it is most likely 0 today too).
weather = weather.ffill()

# Confirm that no missing values remain in any column.
weather.isnull().sum()

# Column dtypes; an "object" dtype usually means the column holds strings.
weather.dtypes

# The index is still stored as plain strings (object dtype) at this point.
weather.index

# Parse the index into datetimes so date components are easy to extract.
weather.index = pd.to_datetime(weather.index)
weather.index

# Year component of each row's date — not easily done with a string index.
weather.index.year

# Records per year, sorted chronologically;
# each year should have 365 rows (366 in leap years).
weather.index.year.value_counts().sort_index()

# Line plot of snow depth ("snwd") over time.
weather["snwd"].plot()
# The prediction target is the NEXT day's maximum temperature:
# shift(-1) pulls each row's value from the row below it,
# so Jan 1's target is Jan 2's tmax.
weather["target"] = weather["tmax"].shift(-1)
weather

# The final row has no "next day", so its target is NaN; forward-fill it.
# One slightly-off target among ~20,000 rows is insignificant.
weather = weather.ffill()
weather
# Ridge regression: like ordinary linear regression, but it penalizes
# (shrinks) the coefficients, which makes it robust to multicollinearity —
# the situation where several predictor columns are correlated with each other.
from sklearn.linear_model import Ridge

# alpha controls how strongly the coefficients are shrunk.
rr = Ridge(alpha=.1)

# Pairwise correlations between columns: precipitation is fairly
# uncorrelated with the rest, while the other columns correlate with each other.
weather.corr()

# Use every column except these three as predictors;
# ~ negates the mask, keeping the columns NOT in the exclusion list.
excluded = ["target", "name", "station"]
predictors = weather.columns[~weather.columns.isin(excluded)]
predictors
def backtest(weather, model, predictors, start=3650, step=90):
    """Walk-forward backtest of ``model`` on the weather data.

    Repeatedly trains on all rows before position ``i`` and predicts the
    next ``step`` rows, so every prediction uses only past data.

    Parameters
    ----------
    weather : pd.DataFrame
        Must contain the ``predictors`` columns and a "target" column.
    model : estimator
        Any object with sklearn-style ``fit(X, y)`` / ``predict(X)``.
    predictors : sequence of column labels
        Feature columns used to fit the model.
    start : int, default 3650
        Rows reserved for the first training set (~10 years of daily data).
    step : int, default 90
        Rows predicted per iteration (~one season).

    Returns
    -------
    pd.DataFrame
        Indexed like the test rows, with columns
        ["actual", "prediction", "diff"] where "diff" is the absolute error.
    """
    all_predictions = []
    for i in range(start, weather.shape[0], step):
        # Train on everything before i; test on the next `step` rows.
        train = weather.iloc[:i, :]
        test = weather.iloc[i:(i + step), :]
        model.fit(train[predictors], train["target"])
        preds = pd.Series(model.predict(test[predictors]), index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        # Absolute error per row — handy for finding the worst predictions.
        combined["diff"] = (combined["prediction"] - combined["actual"]).abs()
        all_predictions.append(combined)
    return pd.concat(all_predictions)
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Run the walk-forward backtest with the ridge model.
predictions = backtest(weather, rr, predictors)

# Mean absolute error: on average, how far off each prediction is.
mean_absolute_error(predictions["actual"], predictions["prediction"])

# Inspect the rows with the largest prediction errors first.
predictions.sort_values("diff", ascending=False)