Objective: To predict the SO2 concentration in the Dwarka region of Delhi for the next few months using an ARIMA model.
The data was taken from the CPCB website via web scraping with Selenium and BeautifulSoup.
Code: https://github.com/ninjakx/CPCB/blob/master/cpcb.py
Project Full Code: https://github.com/ninjakx/AQP/
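For reference, here is a minimal sketch of the kind of Selenium + BeautifulSoup scraper used. The URL, page navigation, and table layout below are placeholders/assumptions; the actual implementation is in the repository linked above.
# Hypothetical sketch only -- the real scraper lives in the linked repository.
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome()
driver.get("https://app.cpcbccr.com/ccr/")  # placeholder URL, not necessarily the exact endpoint used
# ... select the Dwarka station, pollutants and date range on the page here ...
soup = BeautifulSoup(driver.page_source, "html.parser")
# Assumes the readings are rendered as an HTML table of <tr>/<td> rows
rows = [[td.get_text(strip=True) for td in tr.find_all("td")] for tr in soup.find_all("tr")]
driver.quit()
pd.DataFrame([r for r in rows if r]).to_csv("airdata/dwarka-scraped.csv", sep=";", index=False)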
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
%cd "/content/drive/My Drive/AQP-master - Copy (2)/AQP-master/Machine learning"
/content/drive/My Drive/AQP-master - Copy (2)/AQP-master/Machine learning
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
'''arr = ['index.1', 'Nitrogen Dioxide(NO2).1', 'Bar Pressure(Bar Pressure).1',
       'PM 10(RSPM).1', 'PM 2.5(PM2.5).1', 'Sulfur Dioxide(SO2).1',
       'Temperature(TEMP).1']
for i in range(2010, 2017):
    data = pd.read_csv("airdata/dwarka-" + str(i) + ".csv", sep=';', encoding='windows-1252')
    for col in data.columns:
        #print("col:", col)
        if col in arr:
            print(col)
            del data[col]'''
files = ["airdata/dwarka-"+str(i)+".csv" for i in range(2010,2020)]
dfs = [pd.read_csv(fp, sep=';',encoding = 'windows-1252', decimal=',') for fp in files]
#df = pd.concat(dfs).drop_duplicates().reset_index(drop=True)
#print (df)
# The 2018 and 2019 files name the date column 'Date' instead of 'index'; align them
dfs[8]['index'] = dfs[8]['Date']  # 2018
del dfs[8]['Date']
dfs[9]['index'] = dfs[9]['Date']  # 2019
del dfs[9]['Date']
import re
def date_extractor(string):
    matches = re.findall(r'(\d{2,4}[\/\- ]\d{2}[\/\- ]\d{2,4})', string)
    return matches[0]
import dateutil
for df1 in dfs:
    df1['index'] = df1['index'].astype(str).apply(lambda x: date_extractor(x))
    df1['index'] = df1['index'].astype(str).apply(lambda x: dateutil.parser.parse(x))
df = pd.concat(dfs)
df = df.rename(columns = {'index':'Date','Bar Pressure(Bar Pressure)':'Bar Pressure(BP)'})
df.head()
  | Bar Pressure(BP) | Nitrogen Dioxide(NO2) | PM 10(RSPM) | PM 2.5(PM2.5) | Solar Radiation(SR) | Sulfur Dioxide(SO2) | Temperature(TEMP) | Date |
---|---|---|---|---|---|---|---|---|
0 | 646.09 | 12.3 | 124.12 | NaN | NaN | 5.85 | 14.61 | 2010-01-02 |
1 | 732.95 | 27.28 | 70.62 | NaN | NaN | 5.05 | 23.87 | 2010-01-03 |
2 | 735.69 | 29.57 | 109.88 | NaN | NaN | 23.84 | 31.54 | 2010-01-04 |
3 | 734.58 | 15.93 | 61.12 | NaN | NaN | 6.36 | 30.12 | 2010-01-05 |
4 | 729.43 | 45.35 | 163.67 | NaN | NaN | 4.42 | 34.22 | 2010-01-06 |
# Count the non-null readings for each pollutant per (year, month)
data = df[['Bar Pressure(BP)', 'Nitrogen Dioxide(NO2)', 'PM 10(RSPM)', 'PM 2.5(PM2.5)', 'Solar Radiation(SR)',
           'Sulfur Dioxide(SO2)', 'Temperature(TEMP)']].groupby([df.Date.dt.year, df.Date.dt.month]).agg('count').rename_axis(['Year', 'Month'])
d = pd.DataFrame(data)
d.columns
Index(['Bar Pressure(BP)', 'Nitrogen Dioxide(NO2)', 'PM 10(RSPM)',
'PM 2.5(PM2.5)', 'Solar Radiation(SR)', 'Sulfur Dioxide(SO2)',
'Temperature(TEMP)'],
dtype='object')
d.unstack()
Year | BP m1 | BP m2 | BP m3 | BP m4 | BP m5 | BP m6 | BP m7 | BP m8 | BP m9 | BP m10 | ... | Temp m3 | Temp m4 | Temp m5 | Temp m6 | Temp m7 | Temp m8 | Temp m9 | Temp m10 | Temp m11 | Temp m12 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2010 | 30 | 27 | 30 | 29 | 26 | 29 | 31 | 31 | 30 | 31 | ... | 29 | 28 | 25 | 28 | 31 | 31 | 30 | 31 | 30 | 31 |
2011 | 31 | 27 | 29 | 28 | 31 | 30 | 31 | 31 | 30 | 31 | ... | 29 | 28 | 31 | 30 | 31 | 31 | 30 | 31 | 30 | 29 |
2012 | 30 | 26 | 27 | 25 | 29 | 28 | 30 | 30 | 26 | 21 | ... | 27 | 25 | 29 | 28 | 30 | 30 | 23 | 21 | 19 | 26 |
2013 | 31 | 28 | 28 | 30 | 30 | 29 | 30 | 29 | 24 | 28 | ... | 28 | 30 | 30 | 29 | 30 | 29 | 24 | 28 | 28 | 29 |
2014 | 31 | 28 | 31 | 29 | 30 | 29 | 29 | 31 | 30 | 30 | ... | 31 | 29 | 30 | 29 | 29 | 31 | 30 | 30 | 30 | 29 |
2015 | 31 | 27 | 31 | 30 | 31 | 29 | 29 | 28 | 29 | 31 | ... | 31 | 30 | 31 | 29 | 29 | 28 | 29 | 31 | 29 | 30 |
2016 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 29 | 29 | 20 | 23 | 29 | 27 | 25 | 29 | 28 | 25 |
2017 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 30 | 29 | 30 | 29 | 30 | 25 | 30 | 31 | 30 | 30 |
2018 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 29 | 29 | 30 | 29 | 28 | 30 | 29 | 30 | 27 | 11 |
2019 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 21 | 2 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 2 |
10 rows × 84 columns (output truncated)
EDA
bar_pressure = d['Bar Pressure(BP)'].unstack().loc[2012]
bar_pressure
Month
1 30
2 26
3 27
4 25
5 29
6 28
7 30
8 30
9 26
10 21
11 19
12 26
Name: 2012, dtype: int64
import seaborn as sns
import matplotlib.pyplot as plt
# Note: d holds counts (agg('count')), so these plots show data availability, not pollutant levels
ax = sns.barplot(x=bar_pressure.index, y=bar_pressure.values)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
ax.set(xlabel='Month', ylabel='Number of BP readings', title='Available Bar Pressure readings per month, 2012')

ax = d['Bar Pressure(BP)'].unstack().plot.bar(figsize=(20,5), align='edge', width=0.7)
ax.grid(zorder=0)
ax.set(xlabel='Year', ylabel='Number of BP readings', title='Available Bar Pressure readings per month, by year')
#d.unstack(1).plot.barh()

ax = d['Nitrogen Dioxide(NO2)'].unstack().plot.bar(figsize=(20,5), align='edge', width=0.7)
ax.grid(zorder=0)
ax.set(xlabel='Year', ylabel='Number of NO2 readings', title='Available NO2 readings per month, by year')

ax = d['PM 10(RSPM)'].unstack().plot.bar(figsize=(20,5), align='edge', width=0.7)
ax.grid(zorder=0)
ax.set(xlabel='Year', ylabel='Number of PM 10 readings', title='Available PM 10 (RSPM) readings per month, by year')

ax = d['PM 2.5(PM2.5)'].unstack().plot.bar(figsize=(20,5), align='edge', width=0.7)
ax.grid(zorder=0)
ax.set(xlabel='Year', ylabel='Number of PM 2.5 readings', title='Available PM 2.5 readings per month, by year')

ax = d['Solar Radiation(SR)'].unstack().plot.bar(figsize=(20,5), align='edge', width=0.7)
ax.grid(zorder=0)
ax.set(xlabel='Year', ylabel='Number of SR readings', title='Available Solar Radiation readings per month, by year')

ax = d['Sulfur Dioxide(SO2)'].unstack().plot.bar(figsize=(20,5), align='edge', width=0.7)
ax.grid(zorder=0)
ax.set(xlabel='Year', ylabel='Number of SO2 readings', title='Available SO2 readings per month, by year')

ax = d['Temperature(TEMP)'].unstack().plot.bar(figsize=(20,5), align='edge', width=0.7)
ax.grid(zorder=0)
ax.set(xlabel='Year', ylabel='Number of Temp readings', title='Available Temperature readings per month, by year')
def colour_cell(val):
    if val*100/365 < 50:
        color = '#FF6347'
    elif val*100/365 < 85:
        color = 'yellow'
    else:
        color = '#00FF00'
    return 'background-color: %s' % color
Table showing the number of available daily readings for each year
#d.groupby(['Year','Month'])[['Bar Pressure(Bar Pressure)']].sum()
d.groupby('Year').sum().style.applymap(colour_cell)
Year | Bar Pressure(BP) | Nitrogen Dioxide(NO2) | PM 10(RSPM) | PM 2.5(PM2.5) | Solar Radiation(SR) | Sulfur Dioxide(SO2) | Temperature(TEMP) |
---|---|---|---|---|---|---|---|
2010 | 355 | 357 | 354 | 0 | 0 | 302 | 349 |
2011 | 358 | 333 | 347 | 0 | 0 | 341 | 358 |
2012 | 317 | 148 | 311 | 0 | 0 | 304 | 314 |
2013 | 344 | 321 | 354 | 0 | 0 | 354 | 344 |
2014 | 358 | 354 | 339 | 0 | 0 | 352 | 357 |
2015 | 355 | 326 | 80 | 248 | 0 | 351 | 355 |
2016 | 0 | 321 | 0 | 318 | 192 | 319 | 323 |
2017 | 0 | 349 | 0 | 326 | 351 | 346 | 351 |
2018 | 0 | 328 | 0 | 328 | 328 | 327 | 328 |
2019 | 0 | 68 | 0 | 69 | 69 | 69 | 69 |
Observations:
- Looking at the graphs and statistics above, we will take the NO2 and SO2 concentrations from 2013 to 2017, since those years have relatively few missing values.
SO2 concentration in Dwarka
df.head()
  | Bar Pressure(BP) | Nitrogen Dioxide(NO2) | PM 10(RSPM) | PM 2.5(PM2.5) | Solar Radiation(SR) | Sulfur Dioxide(SO2) | Temperature(TEMP) | Date |
---|---|---|---|---|---|---|---|---|
0 | 646.09 | 12.3 | 124.12 | NaN | NaN | 5.85 | 14.61 | 2010-01-02 |
1 | 732.95 | 27.28 | 70.62 | NaN | NaN | 5.05 | 23.87 | 2010-01-03 |
2 | 735.69 | 29.57 | 109.88 | NaN | NaN | 23.84 | 31.54 | 2010-01-04 |
3 | 734.58 | 15.93 | 61.12 | NaN | NaN | 6.36 | 30.12 | 2010-01-05 |
4 | 729.43 | 45.35 | 163.67 | NaN | NaN | 4.42 | 34.22 | 2010-01-06 |
data = df.copy()
data.index = pd.to_datetime(data.Date, format='%Y-%m-%d')
data.sort_index(inplace = True)
print(data.index)
DatetimeIndex(['2010-01-02', '2010-01-03', '2010-01-04', '2010-01-05',
'2010-01-06', '2010-01-07', '2010-01-08', '2010-01-09',
'2010-01-10', '2010-01-11',
...
'2019-05-02', '2019-05-04', '2019-06-02', '2019-07-02',
'2019-08-02', '2019-09-02', '2019-10-02', '2019-11-02',
'2019-12-02', '2019-12-03'],
dtype='datetime64[ns]', name='Date', length=3179, freq=None)
data.drop(['Date'],axis=1,inplace=True)
# convert_objects() has been removed from recent pandas; coerce to numeric explicitly instead
data = data.apply(pd.to_numeric, errors='coerce')
data = data.interpolate(method='linear', axis=0).ffill().bfill()
import seaborn as sns
import matplotlib as mpl
'''def myFormatter(x, pos):
    return pd.to_datetime(x).strftime('%d/%m/%Y')

plt.figure(figsize=(15,5))
ax = sns.tsplot(data['Sulfur Dioxide(SO2)'].values, time=data.index)
ax.xaxis.set_major_formatter(mpl.ticker.FuncFormatter(myFormatter))
ax.set(xlabel='Year', ylabel='SO2 values', title='SO2 Concentration')'''
import matplotlib.pyplot as plt
data1 = data['Sulfur Dioxide(SO2)'].truncate(before = '2013-01-01')
# Note: the next line overwrites the previous one, so only the 'after' cut is applied
# and data1 actually spans the full record up to 2018-12-01.
data1 = data['Sulfur Dioxide(SO2)'].truncate(after = '2018-12-01')
#plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%d/%m/%Y'))
#plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=2))
#data1.plot(figsize=(16,5))
data1.plot(figsize=(14,5), linewidth=2, fontsize=20)
plt.xlabel('Year', fontsize=20);
plt.title("SO2",fontsize=25)
plt.ylabel("ug/m3",fontsize=20)
plt.show()
# https://www.machinelearningplus.com/time-series/time-series-analysis-python/
# Import Data
df = data.truncate(before = '2013-01-01').copy()
#df.reset_index(inplace=True)
# Prepare data
df['year'] = [d.year for d in df.index]
df['month'] = [d.strftime('%b') for d in df.index]
years = df['year'].unique()
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(17,7), dpi= 80)
sns.boxplot(x='year', y='Sulfur Dioxide(SO2)', data=df, ax=axes[0])
sns.boxplot(x='month', y='Sulfur Dioxide(SO2)', data=df, ax=axes[1])
# Set Title
axes[0].set_title('Year-wise Box Plot\n(The Trend)', fontsize=18);
axes[1].set_title('Month-wise Box Plot\n(The Seasonality)', fontsize=18)
plt.show()
- There is only a weak trend, which we can ignore.
- November and December clearly have higher SO2 concentrations, which can be attributed to the festive season.
data_so2 = data.copy()
data_so2=data_so2[['Sulfur Dioxide(SO2)']].truncate(before = '2013-01-01')
data_so2['year'] = data_so2.index.year
data_so2['doy'] = data_so2.index.dayofyear
piv = pd.pivot_table(data_so2,values=['Sulfur Dioxide(SO2)'], index = ['doy'], columns = ['year'])
piv['Sulfur Dioxide(SO2)'].plot(figsize=(15,5),title='SO2')
plt.xlabel('Timestamp')
plt.ylabel('SO2 Concentration')
Checking whether the time series is random using a lag plot
A lag plot checks whether a data set or time series is random. Random data should not exhibit any identifiable structure in the lag plot; non-random structure indicates that the underlying data are not random. It is normally used to check for autocorrelation.
#https://www.itl.nist.gov/div898/handbook/eda/section3/lagplot.htm
from pandas.plotting import lag_plot
lag_plot(data1)
plt.show()
The points are widely scattered, so the series shows only weak autocorrelation at this lag.
A box plot lets us see the outliers.
# library & dataset
import seaborn as sns
# Make boxplot for one group only
sns.boxplot( y=data.values )
#sns.plt.show()
Are the data random?
No
Is there serial correlation in the data?
No
Are there outliers in the data?
Yes
Checking randomness in the time series using the pandas autocorrelation plot
Autocorrelation plots are often used to check randomness in a time series, by computing autocorrelations of the data at varying time lags. If the time series is random, these autocorrelations should be near zero for all lags. If the series is non-random, one or more of the autocorrelations will be significantly non-zero. The horizontal lines in the plot correspond to the 95% and 99% confidence bands; the dashed lines mark the 99% band.
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(data1)
The plot resembles a sine wave, indicating that values correlate strongly with values at later points, i.e. there is seasonality in the data. The series is also not random, since the autocorrelation is clearly non-zero at most lags.
Checking Stationarity by Summary Statistics
#https://stackoverflow.com/questions/7805552/fitting-a-histogram-with-python
from scipy.stats import norm
import matplotlib.pyplot as plt

'''# Plot histogram
data1.hist()
plt.show()'''

# best fit of data
(mu, sigma) = norm.fit(data1)

# histogram of the data ('normed' is deprecated in newer matplotlib; use density=True)
n, bins, patches = plt.hist(data1, 60, density=True, facecolor='green', alpha=0.75)

# add a 'best fit' line (matplotlib.mlab.normpdf has been removed; use scipy's norm.pdf)
y = norm.pdf(bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=2)

# plot
plt.xlabel('SO2 (ug/m3)')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ SO2:}\ \mu=%.3f,\ \sigma=%.3f$' % (mu, sigma))
plt.grid(True)
plt.show()
Common Probability Distribution
Looking at the above distribution, our data resembles a log-normal distribution.
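As a quick sanity check of that impression (not part of the original analysis), one can fit a log-normal with SciPy and overlay it on the histogram, assuming the SO2 values in data1 are strictly positive:
from scipy.stats import lognorm
import numpy as np
import matplotlib.pyplot as plt

vals = data1.values
shape, loc, scale = lognorm.fit(vals, floc=0)  # fix the location at 0 for a pure log-normal fit
x = np.linspace(vals.min(), vals.max(), 200)
plt.hist(vals, bins=60, density=True, alpha=0.6, label='SO2')
plt.plot(x, lognorm.pdf(x, shape, loc, scale), 'r--', label='log-normal fit')
plt.xlabel('SO2 (ug/m3)')
plt.ylabel('Density')
plt.legend()
plt.show()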
Now split the time series into two halves and compare the mean and variance of each half; if they differ and the difference is statistically significant, the time series is likely non-stationary.
X = data1.values
split = int(len(X) / 2)
X1, X2 = X[0:split], X[split:]
mean1, mean2 = X1.mean(), X2.mean()
var1, var2 = X1.var(), X2.var()
print('mean1=%.2f, mean2=%.2f' % (mean1, mean2))
print('variance1=%.2f, variance2=%.2f' % (var1, var2))
mean1=9.34, mean2=10.02
variance1=81.28, variance2=56.19
The mean and variance of the two halves differ, so the series may not be stationary. Since the data resembles a log-normal distribution, let's take the log of the series and plot its distribution curve.
X = data1.values
X = np.log(X)
#plt.hist(X)
#plt.title('Distribution after taking log')
# best fit of data
(mu, sigma) = norm.fit(X)

# histogram of the data (density=True instead of the deprecated normed=1)
n, bins, patches = plt.hist(X, 60, density=True, facecolor='green', alpha=0.75)

# add a 'best fit' line
y = norm.pdf(bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=2)

# plot
plt.xlabel('log SO2')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ log\ SO2:}\ \mu=%.3f,\ \sigma=%.3f$' % (mu, sigma))
plt.grid(True)
plt.show()
After the log transform the distribution looks approximately Gaussian, so the log-transformed series is closer to stationary.
Checking Stationarity by Plotting Rolling Statistics
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    plt.figure(figsize=(14,5))

    # Determine rolling statistics
    rolmean = timeseries.rolling(12).mean()
    rolstd = timeseries.rolling(12).std()

    # Plot rolling statistics
    orig = plt.plot(timeseries, color='blue', label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation of SO2')
    plt.xlabel('Year', fontsize=20)
    plt.ylabel("ug/m3", fontsize=20)
    plt.show(block=False)

    # Perform Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
test_stationarity(data1)
Results of Dickey-Fuller Test:
Test Statistic -5.443141
p-value 0.000003
#Lags Used 29.000000
Number of Observations Used 3070.000000
Critical Value (1%) -3.432482
Critical Value (5%) -2.862482
Critical Value (10%) -2.567271
dtype: float64
The rolling mean and standard deviation change over time, so the series does not look strictly stationary, and the plot also shows seasonality.
Checking Stationarity by Augmented Dickey-Fuller test
The Augmented Dickey-Fuller test is a type of statistical test called a unit root test.
The intuition behind a unit root test is that it determines how strongly a time series is defined by a trend.
There are a number of unit root tests and the Augmented Dickey-Fuller may be one of the more widely used. It uses an autoregressive model and optimizes an information criterion across multiple different lag values.
The null hypothesis of the test is that the time series can be represented by a unit root, that it is not stationary (has some time-dependent structure). The alternate hypothesis (rejecting the null hypothesis) is that the time series is stationary.
Null Hypothesis (H0): If it cannot be rejected, it suggests the time series has a unit root, meaning it is non-stationary (it has some time-dependent structure).
Alternate Hypothesis (H1): The null hypothesis is rejected; it suggests the time series does not have a unit root, meaning it is stationary and does not have a time-dependent structure. We interpret this result using the p-value from the test: a p-value below a threshold (such as 5% or 1%) suggests we reject the null hypothesis (stationary), while a p-value above the threshold suggests we fail to reject it (non-stationary).
p-value > 0.05: Fail to reject the null hypothesis (H0); the data has a unit root and is non-stationary.
p-value <= 0.05: Reject the null hypothesis (H0); the data does not have a unit root and is stationary.
For example, a p-value of 0.2924 means the null hypothesis could only be rejected at a significance level of about 29%, which is far higher than the traditional significance levels (1%, 5%, and 10%), so we would fail to reject it.
# https://machinelearningmastery.com/time-series-data-stationary-python/
from statsmodels.tsa.stattools import adfuller
X = data1.values
result = adfuller(X)
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
print('\t%s: %.3f' % (key, value))
ADF Statistic: -5.443141
p-value: 0.000003
Critical Values:
1%: -3.432
5%: -2.862
10%: -2.567
The test statistic is -5.443141. The more negative this statistic, the more likely we are to reject the null hypothesis (i.e. the dataset is stationary).
As part of the output we get a look-up table of critical values to help interpret the ADF statistic. Our statistic of -5.443141 is below the 1% critical value of -3.432.
This suggests we can reject the null hypothesis at a significance level of less than 1% (i.e. a low probability that the result is a statistical fluke).
Rejecting the null hypothesis means the process has no unit root, and in turn that the time series is stationary, i.e. it does not have a time-dependent structure.
Why do we make the time series stationary?
Auto Regressive’ in ARIMA means it is a linear regression model that uses its own lags as predictors. Linear regression models, as you know, work best when the predictors are not correlated and are independent of each other.
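To make the "regression on its own lags" idea concrete, here is a small illustrative sketch (not part of the original notebook) that fits an ordinary linear regression of the SO2 series on its previous value, which is essentially an AR(1) model:
from sklearn.linear_model import LinearRegression
import numpy as np

y = data1.values
X_lag1 = y[:-1].reshape(-1, 1)   # y_{t-1} as the single predictor
y_t = y[1:]                      # y_t as the target
ar1 = LinearRegression().fit(X_lag1, y_t)
print('AR(1) slope: %.3f, intercept: %.3f' % (ar1.coef_[0], ar1.intercept_))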
ESTIMATING THE TREND
Using rolling average
There are several ways to identify trends in a time series. One popular way is to take a rolling average: for each time point, you average the points in a window around it, where the window size is something you have to choose.
Because you are averaging, this tends to smooth out noise and seasonality.
When choosing the window size, it makes sense to first try a window of twelve, since we are interested in yearly seasonality.
fig = plt.figure(figsize=(12,6))
data1.plot(linewidth=2, fontsize=20,label='SO2')
data1.rolling(12).mean().plot(linewidth=2, fontsize=20, color='r',label='Rolling mean')
plt.title("SO2",fontsize=25)
plt.xlabel('Year', fontsize=20)
plt.ylabel("ug/m3",fontsize=20)
plt.legend()
plt.show()
Now we have the trend we were looking for: most of the seasonality from the previous plot has been smoothed out. Some seasonality may still remain, though, since the graph shows repetitive patterns.
Using Regression method
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(14,6))
from sklearn.linear_model import LinearRegression
model = LinearRegression()
X = [i for i in range(0, len(data1))]
X = np.reshape(X, (len(X), 1))
y = data1
model.fit(X, y)
trend = model.predict(X)
trend = pd.DataFrame(trend)
trend.index = data1.index
# plot trend
plt.plot(y,label='SO2')
plt.plot(trend,label='Trend')
plt.xlabel('Year', fontsize=20);
plt.title("SO2 (Original)",fontsize=25)
plt.ylabel("ug/m3",fontsize=20)
plt.legend()
plt.show()
It shows an upward trend.
ELIMINATING THE TREND
Transformation
We can use transformations such as log, square root, or cube root to reduce the trend.
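Besides the log transform used below, the other transformations mentioned can be tried in the same way; a quick sketch for comparison:
import numpy as np
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (name, ts) in zip(axes, [('log', np.log(data1)),
                                 ('square root', np.sqrt(data1)),
                                 ('cube root', np.cbrt(data1))]):
    ts.plot(ax=ax)
    ax.set_title('SO2 (%s transform)' % name)
plt.tight_layout()
plt.show()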
plt.figure(figsize=(14,6))
ts_log = np.log(data1)
plt.plot(ts_log)
plt.xlabel('Year', fontsize=20);
plt.title("SO2 (Log Transformed)",fontsize=25)
plt.ylabel("ug/m3",fontsize=20)
plt.show()
No trend can be seen after applying the log transformation. Since we saw earlier that there is strong seasonality, we will now focus on removing it.
ELIMINATING SEASONALITY
Differencing
plt.figure(figsize=(14,6))
ts_log_diff = ts_log - ts_log.shift()
plt.plot(ts_log_diff)
plt.xlabel('Year', fontsize=20);
plt.title("SO2 (Log Transformed + 1st Order Differencing)",fontsize=25)
plt.ylabel("ug/m3",fontsize=20)
plt.show()
ts_log_diff.dropna(inplace=True)
test_stationarity(ts_log_diff)
Results of Dickey-Fuller Test:
Test Statistic -19.437741
p-value 0.000000
#Lags Used 29.000000
Number of Observations Used 3069.000000
Critical Value (1%) -3.432483
Critical Value (5%) -2.862482
Critical Value (10%) -2.567272
dtype: float64
Next we will decompose the series into trend, seasonal, and residual components.
Decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
plt.figure(figsize=(10,10))
decomposition = seasonal_decompose(ts_log, freq=12)  # note: newer statsmodels versions use period= instead of freq=
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
plt.subplot(411)
plt.plot(ts_log, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()
Stationary and Non-Stationary Time Series
Source: R’s TSTutorial
import numpy as np, pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.figsize':(9,7), 'figure.dpi':120})
# Import data
df = data1
# Original Series
fig, axes = plt.subplots(3, 2)
axes[0, 0].plot(df.values); axes[0, 0].set_title('Original Series')
plot_acf(df.values, ax=axes[0, 1])
axes[0, 1].set_xlim([0,20])
# 1st Differencing
axes[1, 0].plot(np.diff(df.values)); axes[1, 0].set_title('1st Order Differencing')
plot_acf(np.diff(df.values), ax=axes[1, 1])
axes[1, 1].set_xlim([0,20])
# 2nd Differencing
axes[2, 0].plot(np.diff(np.diff(df.values))); axes[2, 0].set_title('2nd Order Differencing')
plot_acf(np.diff(np.diff(df.values)), ax=axes[2, 1])
axes[2, 1].set_xlim([0,20])
fig.tight_layout()
plt.show()
We will use first-order differencing, since increasing the order further can lead to over-differencing.
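As a cross-check on the choice of d (a sketch, assuming the pmdarima package is installed; it is not used in the original notebook):
from pmdarima.arima.utils import ndiffs

print('ADF test suggests d =', ndiffs(ts_log, test='adf'))
print('KPSS test suggests d =', ndiffs(ts_log, test='kpss'))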
ARIMA(p,d,q)
# PACF plot of 1st differenced series
# https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plt.rcParams.update({'figure.figsize':(9,3), 'figure.dpi':120})
plt.plot(ts_log.diff())
plt.title('1st Differencing')
plot_pacf(ts_log.diff().dropna(), lags=20)
plt.show()
From the partial autocorrelation plot, we can take p = 1.
# ACF plot of 1st differenced series
# https://www.machinelearningplus.com/time-series/arima-model-time-series-forecasting-python/
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plt.rcParams.update({'figure.figsize':(9,3), 'figure.dpi':120})
plt.plot(ts_log.diff())
plt.title('1st Differencing')
plot_acf(ts_log.diff().dropna(), lags=20)
plt.show()
From the autocorrelation plot, we can take q = 1.
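Before settling on (p, d, q) = (1, 1, 1), a few neighbouring orders can be cross-checked by AIC (a sketch using the same ARIMA class as the cell below; lower AIC is better):
from statsmodels.tsa.arima_model import ARIMA

for order in [(0, 1, 1), (1, 1, 0), (1, 1, 1), (2, 1, 1), (1, 1, 2)]:
    try:
        aic = ARIMA(ts_log, order=order).fit(disp=0).aic
        print('ARIMA%s  AIC = %.1f' % (order, aic))
    except Exception as exc:
        print('ARIMA%s failed to fit: %s' % (order, exc))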
# Note: statsmodels.tsa.arima_model.ARIMA is the older API (removed in statsmodels >= 0.13,
# where statsmodels.tsa.arima.model.ARIMA replaces it).
from statsmodels.tsa.arima_model import ARIMA

# (1,1,1) ARIMA model
model = ARIMA(ts_log, order=(1,1,1))
model_fit = model.fit(disp=0)
print(model_fit.summary())
ARIMA Model Results
=================================================================================
Dep. Variable: D.Sulfur Dioxide(SO2) No. Observations: 3099
Model: ARIMA(1, 1, 1) Log Likelihood -2569.272
Method: css-mle S.D. of innovations 0.554
Date: Sat, 27 Apr 2019 AIC 5146.544
Time: 09:02:51 BIC 5170.700
Sample: 1 HQIC 5155.219
===============================================================================================
coef std err z P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------------------
const 0.0003 0.001 0.403 0.687 -0.001 0.002
ar.L1.D.Sulfur Dioxide(SO2) 0.4312 0.019 22.187 0.000 0.393 0.469
ma.L1.D.Sulfur Dioxide(SO2) -0.9533 0.008 -122.400 0.000 -0.969 -0.938
Roots
=============================================================================
Real Imaginary Modulus Frequency
-----------------------------------------------------------------------------
AR.1 2.3190 +0.0000j 2.3190 0.0000
MA.1 1.0490 +0.0000j 1.0490 0.0000
-----------------------------------------------------------------------------
# Plot residual errors
residuals = pd.DataFrame(model_fit.resid)
fig, ax = plt.subplots(1,2)
residuals.plot(title="Residuals", ax=ax[0])
residuals.plot(kind='kde', title='Density', ax=ax[1])
plt.show()
print(residuals.describe())
0
count 3099.000000
mean -0.000302
std 0.554533
min -2.000433
25% -0.285896
50% -0.011139
75% 0.267756
max 2.657653
We plot the residuals to check for any trend information not captured by the model, and the density plot to check their distribution. The density plot suggests the residual errors are roughly Gaussian and, per the summary statistics, centered very close to zero (mean ≈ -0.0003).
# Actual vs Fitted
model_fit.plot_predict(dynamic=False)
plt.show()
from statsmodels.tsa.stattools import acf
# Create Training and Test
train = df[:3000]
test = df[3000:]
# Build Model
# model = ARIMA(train, order=(3,2,1))
model = ARIMA(train, order=(1, 1, 1))
fitted = model.fit(disp=-1)
# Forecast
fc, se, conf = fitted.forecast(len(test), alpha=0.05) # 95% conf
# Make as pandas series
fc_series = pd.Series(fc, index=test.index)
lower_series = pd.Series(conf[:, 0], index=test.index)
upper_series = pd.Series(conf[:, 1], index=test.index)
# Plot
plt.figure(figsize=(12,5), dpi=100)
plt.plot(train, label='training')
plt.plot(test, label='actual')
plt.plot(fc_series, label='forecast')
plt.fill_between(lower_series.index, lower_series, upper_series,
color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend(loc='upper left', fontsize=8)
plt.show()
The forecast flattens into a nearly straight line because there is a strong seasonal component, so we will move to SARIMA, which adds seasonal differencing. Seasonal differencing is similar to regular differencing, but instead of subtracting consecutive terms you subtract the value from the previous season.
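For intuition, seasonal differencing can be applied manually with a lagged diff (a sketch; the lag of 6 mirrors the seasonal period used in the SARIMAX fit below):
import matplotlib.pyplot as plt

seasonal_diff = ts_log.diff(6).dropna()   # subtract the value from the previous "season"
seasonal_diff.plot(figsize=(14, 4), title='SO2 (log) after seasonal differencing, lag 6')
plt.show()
test_stationarity(seasonal_diff)          # reuse the ADF helper defined earlier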
df1 = df.copy()   # copy() must be called; a bare .copy would assign the method itself
df=data1
dfseries = ts_log
from statsmodels.tsa.stattools import acf
# Create Training and Test
train = dfseries[:3000]
test = dfseries[3000:]
import statsmodels.api as sm
mod = sm.tsa.statespace.SARIMAX(dfseries,
order=(1, 1, 1),
seasonal_order=(1, 1, 1, 6),
enforce_stationarity=False,
enforce_invertibility=False)
results = mod.fit()
print(results.summary().tables[1])
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.4323 0.015 29.318 0.000 0.403 0.461
ma.L1 -0.9559 0.006 -161.708 0.000 -0.967 -0.944
ar.S.L6 0.0385 0.017 2.298 0.022 0.006 0.071
ma.S.L6 -1.0002 0.051 -19.484 0.000 -1.101 -0.900
sigma2 0.3063 0.017 17.935 0.000 0.273 0.340
==============================================================================
results.plot_diagnostics(figsize=(10, 8))
plt.show()
How do we interpret the diagnostic plots?
Top left: the residual errors seem to fluctuate around a mean of zero and have a roughly uniform variance.
Top right: the density plot suggests a normal distribution with mean zero.
Bottom left: all the dots should fall approximately on the red line; any significant deviation would imply the distribution is skewed.
Bottom right: the correlogram (ACF plot) shows that the residual errors are not autocorrelated. Any autocorrelation would imply a pattern in the residual errors that the model does not explain, so more predictors (X's) would need to be added to the model.
pred = results.get_prediction(start=3000, dynamic=False)
pred_ci = pred.conf_int()
plt.rcParams.update({'figure.figsize':(10,4), 'figure.dpi':120})
ax = np.exp(dfseries).plot(label='observed')
np.exp(pred.predicted_mean).plot(ax=ax, label='One-step ahead Forecast', alpha=.7,color='r')
ax.fill_between(pred_ci.index,
pred_ci.iloc[:, 0],
pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('SO2 Levels')
plt.legend()
plt.show()
y_forecasted = pred.predicted_mean
y_truth = test
# Compute the mean square error
mse = ((np.exp(y_forecasted) - np.exp(y_truth)) ** 2).mean()
print('The Mean Squared Error of our forecasts (MSE) is {}'.format(round(mse, 2)))
print('The Root Mean Squared Error of our forecasts (RMSE) is {}'.format(round(mse**0.5, 2)))
The Mean Squared Error of our forecasts (MSE) is 36.42
The Root Mean Squared Error of our forecasts (RMSE) is 6.03
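Beyond MSE and RMSE, a few other common forecast-accuracy measures can be computed on the back-transformed values (a sketch, not part of the original results):
import numpy as np

actual = np.exp(y_truth.values)
forecast = np.exp(y_forecasted.values)
mae = np.mean(np.abs(forecast - actual))                      # mean absolute error
mape = np.mean(np.abs(forecast - actual) / np.abs(actual))    # mean absolute percentage error
corr = np.corrcoef(forecast, actual)[0, 1]                    # correlation between forecast and actuals
print('MAE: %.2f  MAPE: %.2f%%  Correlation: %.2f' % (mae, mape * 100, corr))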
df_err = pd.DataFrame(list(zip(np.exp(y_forecasted),np.exp(y_truth))), index=y_forecasted.index, columns =['Predicted','Actual'])
df_err
Date | Predicted | Actual |
---|---|---|
2018-08-19 | 8.712283 | 16.75 |
2018-08-20 | 11.458510 | 15.44 |
2018-08-21 | 11.169078 | 12.83 |
2018-08-22 | 10.110420 | 11.03 |
2018-08-23 | 10.013742 | 9.77 |
2018-08-24 | 9.748626 | 7.61 |
2018-08-25 | 8.712044 | 7.38 |
2018-08-26 | 7.950748 | 7.12 |
2018-08-27 | 7.927625 | 7.94 |
2018-08-28 | 8.115069 | 10.83 |
2018-08-29 | 9.846503 | 9.47 |
2018-08-30 | 9.399130 | 12.00 |
2018-08-31 | 10.419743 | 10.97 |
2018-09-01 | 9.469851 | 12.64 |
2018-09-02 | 10.489269 | 10.60 |
2018-09-03 | 9.625009 | 19.97 |
2018-09-04 | 13.572208 | 5.65 |
2018-09-05 | 7.817236 | 10.60 |
2018-09-06 | 10.185522 | 6.00 |
2018-09-07 | 7.395344 | 7.76 |
2018-09-08 | 8.397610 | 4.55 |
2018-09-09 | 6.543036 | 11.94 |
2018-09-10 | 10.050886 | 17.55 |
2018-09-11 | 13.035648 | 23.81 |
2018-09-13 | 14.566187 | 8.23 |
2018-09-14 | 8.645796 | 9.89 |
2018-09-15 | 9.385712 | 17.43 |
2018-09-16 | 12.689951 | 17.24 |
2018-09-17 | 13.372236 | 14.90 |
2018-09-18 | 12.988379 | 16.41 |
... | ... | ... |
2018-10-30 | 20.887943 | 22.04 |
2018-10-31 | 19.072010 | 28.55 |
2018-11-01 | 21.469495 | 7.03 |
2018-11-02 | 10.368022 | 14.44 |
2018-11-03 | 14.851825 | 13.07 |
2018-11-04 | 14.094758 | 5.87 |
2018-11-05 | 9.880495 | 11.59 |
2018-11-06 | 13.816855 | 2.63 |
2018-11-07 | 6.273186 | 8.54 |
2018-11-08 | 10.539238 | 8.01 |
2018-11-09 | 10.215746 | 3.67 |
2018-11-10 | 6.649106 | 14.18 |
2018-11-11 | 13.438349 | 25.10 |
2018-11-13 | 16.790090 | 26.55 |
2018-11-14 | 18.579586 | 21.61 |
2018-11-15 | 15.824492 | 15.72 |
2018-11-16 | 13.703732 | 12.29 |
2018-11-17 | 12.899622 | 15.72 |
2018-11-18 | 15.150317 | 17.29 |
2018-11-19 | 16.024347 | 18.43 |
2018-11-20 | 16.202653 | 16.28 |
2018-11-21 | 14.390149 | 16.51 |
2018-11-22 | 14.838832 | 24.96 |
2018-11-23 | 18.150540 | 27.21 |
2018-11-24 | 20.076176 | 15.02 |
2018-11-25 | 15.599335 | 18.64 |
2018-11-26 | 16.919921 | 25.12 |
2018-11-27 | 18.566732 | 26.36 |
2018-11-28 | 20.022663 | 12.10 |
2018-12-01 | 13.743775 | 10.17 |
100 rows × 2 columns
df_err.plot(rot=45)
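Finally, to address the original objective of forecasting SO2 for the next few months, the fitted SARIMAX results object can produce an out-of-sample forecast. This is a sketch: the 90-step horizon is an assumption for roughly three months of daily data, and since the datetime index has no fixed frequency, statsmodels may index the forecast by integer steps rather than dates.
import numpy as np
import matplotlib.pyplot as plt

future = results.get_forecast(steps=90)          # forecast beyond the last observation
future_ci = future.conf_int()

fc_mean = np.exp(future.predicted_mean)          # back-transform from log scale
plt.figure(figsize=(12, 4))
plt.plot(fc_mean.index, fc_mean.values, color='r', label='forecast')
plt.fill_between(future_ci.index,
                 np.exp(future_ci.iloc[:, 0]),
                 np.exp(future_ci.iloc[:, 1]), color='k', alpha=.2)
plt.title('SO2 forecast for the next ~3 months')
plt.xlabel('Forecast step')
plt.ylabel('SO2 (ug/m3)')
plt.legend()
plt.show()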
Conclusion:
The Mean Squared Error (MSE) of our forecasts is 36.42.
The Root Mean Squared Error (RMSE) of our forecasts is 6.03.