Auto ARIMA & LSTM
import pmdarima
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA  # arima_model was removed in statsmodels 0.13
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import pickle
path = '/user/antste/user/jjin.choi/python/lab/time-series/data/license'
filename = 'project_target_license.pickle'
with open(os.path.join(path, filename), mode='rb') as f:
    df_target_dict = pickle.load(f)
#df_target_dict[$PROJECT][$FEATURE]
path = '/user/antste/user/jjin.choi/python/lab/time-series/data/license'
filename = 'project_target_feature.pickle'
with open(os.path.join(path, filename), mode='rb') as f:
    feature_dict = pickle.load(f)
filename = 'mid-range_target_license.pickle'
with open(os.path.join(path, filename), mode='rb') as f:
    midrange_dict = pickle.load(f)
filename = 'high-end_target_license.pickle'
with open(os.path.join(path, filename), mode='rb') as f:
    highend_dict = pickle.load(f)
filename = 'custom_target_license.pickle'
with open(os.path.join(path, filename), mode='rb') as f:
    custom_dict = pickle.load(f)
filename = 'connectivity_target_license.pickle'
with open(os.path.join(path, filename), mode='rb') as f:
    connectivity_dict = pickle.load(f)
# project list
path = '/user/antste/user/jjin.choi/python/lab/time-series/data'
project_csv = 'project_info.csv'
project_df = pd.read_csv(os.path.join(path, project_csv))
project_list = list(project_df[project_df.code != '0'].code)
#project_df
#project_list
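For reference, a quick access sketch for the nested dicts loaded above; the exact keys are assumptions based on the `#df_target_dict[$PROJECT][$FEATURE]` note.
proj = project_list[0]                       # hypothetical first project code
feat = list(df_target_dict[proj].keys())[0]  # hypothetical first feature name
print(df_target_dict[proj][feat].head())     # per-feature license usage frame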
# REFERENCE CODE for Differencing
#from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
#import matplotlib.pyplot as plt
#plt.rcParams.update({'figure.figsize':(9,7), 'figure.dpi':120})
# Original Series
#fig, axes = plt.subplots(3, 2, sharex=True)
#axes[0, 0].plot(df.value); axes[0, 0].set_title('Original Series')
#plot_acf(df.value, ax=axes[0, 1])
# 1st Differencing
#axes[1, 0].plot(df.value.diff()); axes[1, 0].set_title('1st Order Differencing')
#plot_acf(df.value.diff().dropna(), ax=axes[1, 1])
# 2nd Differencing
#axes[2, 0].plot(df.value.diff().diff()); axes[2, 0].set_title('2nd Order Differencing')
#plot_acf(df.value.diff().diff().dropna(), ax=axes[2, 1])
#plt.show()
from statsmodels.tsa.stattools import adfuller
def adf_test(timeseries):
    print("Results of Dickey-Fuller Test:")
    dftest = adfuller(timeseries, autolag="AIC")
    dfoutput = pd.Series(
        dftest[0:4],
        index=[
            "Test Statistic",
            "p-value",
            "#Lags Used",
            "Number of Observations Used",
        ],
    )
    for key, value in dftest[4].items():
        dfoutput["Critical Value (%s)" % key] = value
    print(dfoutput)
    if dfoutput["p-value"] > 0.05:  # fail to reject the unit root -> non-stationary
        print("Need differencing")
from statsmodels.tsa.stattools import kpss
def kpss_test(timeseries):
    print("Results of KPSS Test:")
    kpsstest = kpss(timeseries, regression="c", nlags="auto")
    kpss_output = pd.Series(
        kpsstest[0:3], index=["Test Statistic", "p-value", "Lags Used"]
    )
    for key, value in kpsstest[3].items():
        kpss_output["Critical Value (%s)" % key] = value
    print(kpss_output)
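A quick sanity check of both tests on a synthetic random walk (illustrative only; a random walk has a unit root):
rw = pd.Series(np.cumsum(np.random.randn(200)))
adf_test(rw)   # H0: unit root; p > 0.05 here means differencing is needed
kpss_test(rw)  # H0: stationary; p < 0.05 here points to non-stationarity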
def for_subplots(proj_dict):
    fig = plt.figure(figsize=(30, 20))
    for i, feat in enumerate(proj_dict.keys()):
        df = proj_dict[feat][['timestamp', 'lic_usage']]
        df.set_index('timestamp', inplace=True)
        feat_num = len(proj_dict.keys())
        ax = fig.add_subplot(int(feat_num / 5) + 1, 5, i + 1)
        ax.plot(df)
        ax.get_xaxis().set_visible(False)
        ax.set_title(feat)
    plt.subplots_adjust(hspace=1.0)
    plt.show()
# flagship_dict is loaded the same way as the other *_target_license dicts above
lic_path = '/user/antste/user/jjin.choi/python/lab/time-series/data/license'
with open(os.path.join(lic_path, 'flagship_target_license.pickle'), mode='rb') as f:
    flagship_dict = pickle.load(f)
for_subplots(flagship_dict)
#flagship_dict['Affirma_sim_analysis_env'].set_index('timestamp')
def predict(df):
    from pmdarima.arima import auto_arima
    from sklearn.model_selection import train_test_split
    # NOTE: train_test_split shuffles rows by default, which breaks time order
    train, test = train_test_split(df, test_size=0.2)
    plt.plot(train)
    plt.plot(test)
    arima_model = auto_arima(train, start_p=0, d=1, start_q=0, max_p=5, max_d=5, max_q=5,
                             start_P=0, D=1, start_Q=0, max_P=5, max_Q=5, m=12,
                             seasonal=True, error_action='warn', trace=True,
                             suppress_warnings=True, stepwise=True, random_state=20, n_fits=20)
    print(arima_model.summary())
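Since `train_test_split` shuffles rows by default, a minimal chronological split sketch (same 80/20 ratio) could look like this:
def chrono_split(df, test_size=0.2):
    # keep time order: the last test_size fraction becomes the hold-out set
    cut = int(len(df) * (1 - test_size))
    return df.iloc[:cut], df.iloc[cut:]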
def predict_scatter(df):
    from pmdarima.arima import auto_arima
    from sklearn.model_selection import train_test_split
    train, test = train_test_split(df, test_size=0.2)  # shuffled split: points interleave in time
    ax = plt.axes()
    ax.scatter(train['timestamp'], train['lic_usage'], c='DarkBlue', marker=".")
    ax.scatter(test['timestamp'], test['lic_usage'], c='Orange', marker=".")
    plt.show()
    # index both splits by timestamp before fitting
    test.set_index('timestamp', inplace=True)
    train.set_index('timestamp', inplace=True)
    arima_model = auto_arima(test, start_p=0, d=1, start_q=0, max_p=5, max_d=5, max_q=5,
                             start_P=0, D=1, start_Q=0, max_P=5, max_Q=5, m=12,
                             seasonal=True, error_action='warn', trace=True,
                             suppress_warnings=True, stepwise=True, random_state=20, n_fits=20)
    arima_model.fit(train)
    print(arima_model.summary())
max(flagship_df['timestamp'])
len(flagship_df['timestamp'])
min(flagship_df['timestamp'])
def predict_tscv(df, time_col='timestamp'):
    # time_col lets the same function handle both 'timestamp' and 'timestamp_' frames
    from pmdarima.arima import auto_arima
    from sklearn.model_selection import TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=10)
    X = df[time_col]
    y = df.lic_usage
    # NOTE: train/test are overwritten each fold, so only the last split is kept
    for train_idx, test_idx in tscv.split(df.lic_usage):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        ax = plt.axes()
        ax.scatter(X_train, y_train, c='DarkBlue', marker=".")
        ax.scatter(X_test, y_test, c='Orange', marker=".")
        train = pd.concat([X_train, y_train], axis=1, join='inner')
        test = pd.concat([X_test, y_test], axis=1, join='inner')
    plt.show()
    test.set_index(time_col, inplace=True)
    train.set_index(time_col, inplace=True)
    # order search on the hold-out fold, then refit on train (as in the original notebook)
    arima_model = auto_arima(test, start_p=0, d=1, start_q=0, max_p=5, max_d=5, max_q=5,
                             start_P=0, D=1, start_Q=0, max_P=5, max_Q=5, m=12,
                             seasonal=True, error_action='warn', trace=True,
                             suppress_warnings=True, stepwise=True, random_state=20, n_fits=20)
    res = arima_model.fit(train)
    #dis, bx = plt.subplots()
    #bx = df.loc['2019-9-11'].plot(bx = bx)
    #dis = res.plot_predict('2021-1-12', '2022-7-12', dynamic = True, bx = bx, plot_insample= False)
    arima_model.summary()
    return train, test, arima_model
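Only the last fold is modeled above; a minimal walk-forward sketch (hypothetical helper, same timestamp/lic_usage layout) scores every fold instead:
from sklearn.metrics import mean_squared_error
def walk_forward_rmse(df, n_splits=5, time_col='timestamp'):
    from pmdarima.arima import auto_arima
    from sklearn.model_selection import TimeSeriesSplit
    y = df.set_index(time_col)['lic_usage']
    scores = []
    for train_idx, test_idx in TimeSeriesSplit(n_splits=n_splits).split(y):
        # fit on the expanding window, forecast the whole next fold
        model = auto_arima(y.iloc[train_idx], seasonal=True, m=12,
                           suppress_warnings=True, error_action='ignore')
        pred = model.predict(n_periods=len(test_idx))
        scores.append(np.sqrt(mean_squared_error(y.iloc[test_idx], pred)))
    return scores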
def make_separate_timestamp(df, sep):
    sep_list = np.unique(df[sep])
    time_list = list(df[df[sep] == sep_list[0]].timestamp)
    start = time_list[0]
    end = time_list[-1]
    total_list = time_list
    for s in sep_list[1:]:
        time_list = list(df[df[sep] == s].timestamp)
        start = time_list[0]
        offset = pd.DateOffset((end - start).days + 1)
        time_list = pd.DatetimeIndex(time_list) + offset
        end = time_list[-1]
        total_list.extend(time_list)
    total_list = pd.Series(total_list, name='timestamp_')
    df = pd.concat([df, total_list], axis=1)
    return df
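For intuition, a tiny worked example of what `make_separate_timestamp` does (hypothetical two-project frame): project B's dates are shifted to start the day after project A ends.
demo = pd.DataFrame({
    'project': ['A', 'A', 'B', 'B'],
    'timestamp': pd.to_datetime(['2020-01-01', '2020-01-02',
                                 '2020-01-01', '2020-01-02']),
    'lic_usage': [1, 2, 3, 4],
})
# offset = (A's end - B's start).days + 1 = 2 days, so B maps to 01-03, 01-04
print(make_separate_timestamp(demo, 'project')['timestamp_'])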
key_list_flagship = list(flagship_dict.keys())  # feature keys (implicit in the original notebook)
flagship_sample = flagship_dict[key_list_flagship[1]]
flagship_sample.info()
flagship_sample_ = make_separate_timestamp(flagship_sample, 'project')
flagship3_df = flagship_sample_[['timestamp_', 'lic_usage']]
#flagship1_df
flagship3_df['timestamp_'] = np.datetime_as_string(flagship3_df['timestamp_'], unit='D')
flagship3_df.info()
#predict_tscv(flagship_df)
#datetime64[ns] - flagship_df['timestamp'] = pd.to_datetime(flagship_df['timestamp'])
#datetime - flagship_df['timestamp'] = flagship_df['timestamp'].dt.date
#flagship_df['timestamp'] = flagship_df['timestamp'].astype(str).str[:10]
#flagship_df['timestamp'] = flagship_df['timestamp'].dt.strftime('%Y-%m-%d')
flagship_df['timestamp'] = np.datetime_as_string(flagship_df['timestamp'] , unit='D')
flagship_df['timestamp']
train, test, arima_model = predict_tscv(flagship3_df, time_col='timestamp_')
train.reset_index(level=0, inplace=True)
test.reset_index(level=0, inplace=True)
forecast = arima_model.predict(n_periods=len(test))  # forecast must exist before its index is built
forecast_index = pd.date_range(pd.to_datetime(test.timestamp_.iloc[-1]) + pd.DateOffset(1), periods=len(forecast), freq='D')
forecast = pd.DataFrame(forecast, index=forecast_index, columns=['predicted_usage'])
forecast.reset_index(level=0, inplace=True)
forecast['timestamp'] = np.datetime_as_string(forecast['index'], unit='D')
forecast.info()
#train.reset_index(drop=True, inplace=True)
#test.reset_index(drop=True, inplace=True)
#forecast.reset_index(drop=True, inplace=True)
cx = plt.axes()
# Plot the predictions for validation(test) set with train and test data
cx.scatter(train['timestamp_'], train['lic_usage'], label='Train', c = 'Blue', marker = ".")
cx.scatter(test['timestamp_'], test['lic_usage'], label='Validation', c = 'Orange', marker = ".")
cx.scatter(forecast['timestamp'], forecast.predicted_usage, label = 'Prediction', c = 'Red', marker = '.')
#train.reset_index(level= 0, inplace=True)
#test.reset_index(level= 0, inplace=True)
#train.drop(columns=['level_0', 'index'], inplace = True)
#test.drop(columns=['level_0', 'index'], inplace = True)
test
train
forecast = arima_model.predict(n_periods=len(test))
forecast = pd.DataFrame(forecast, index=test.index, columns=['predicted_usage'])
#forecast.reset_index(level=0, inplace=True)
cx = plt.axes()
cx.scatter(forecast.index, forecast.predicted_usage, label = 'Prediction', marker = '.')
plt.show()
cx = plt.axes()
cx.plot(forecast, label='Prediction')
plt.show()
#====================================== EXAMPLE CODES ====================================#
#calculate rmse
from math import sqrt
from sklearn.metrics import mean_squared_error
rms = sqrt(mean_squared_error(test,forecast))
print(rms)
flagship_df.set_index('timestamp', inplace = True)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(flagship_df)
plot_pacf(flagship_df)
#predict_scatter(flagship_df)
#predict(flagship_df)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plt.rcParams.update({'figure.figsize':(9,3), 'figure.dpi':120})
fig, axes = plt.subplots(1, 2, sharex=True)
axes[0].plot(flagship_df.diff()); axes[0].set_title('1st Differencing')
axes[1].set(ylim=(0,1.2))
plot_acf(flagship_df.diff().dropna(), ax=axes[1])
plt.show()
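pmdarima can also estimate the differencing order directly; a short sketch on the same series:
from pmdarima.arima import ndiffs
print("suggested d (ADF): ", ndiffs(flagship_df['lic_usage'], test='adf'))
print("suggested d (KPSS):", ndiffs(flagship_df['lic_usage'], test='kpss'))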
def objfunc(order, exog, endog):
    from statsmodels.tsa.arima.model import ARIMA
    fit = ARIMA(endog, exog=exog, order=order).fit()
    return fit.aic  # aic is an attribute, not a method
from scipy.optimize import brute
grid = (slice(1, 3, 1), slice(1, 3, 1), slice(1, 3, 1))
brute(objfunc, grid, args=(exog, endog), finish=None)  # exog/endog assumed defined
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets (features/labels assumed defined)
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=42)
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
# Train the model on training data
rf.fit(train_features, train_labels)
# Calculate rmse
from math import sqrt
from sklearn.metrics import mean_squared_error
rms = sqrt(mean_squared_error(valid, forecast))  # valid/forecast assumed defined
print(rms)
# Import the library
from pmdarima import auto_arima
# Ignore harmless warnings
import warnings
warnings.filterwarnings("ignore")
# Fit auto_arima function to AirPassengers dataset
stepwise_fit = auto_arima(airline['# Passengers'], start_p = 1, start_q = 1,
max_p = 3, max_q = 3, m = 12,
start_P = 0, seasonal = True,
d = None, D = 1, trace = True,
error_action ='ignore', # we don't want to know if an order does not work
suppress_warnings = True, # we don't want convergence warnings
stepwise = True) # set to stepwise
# To print the summary
stepwise_fit.summary()
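A short follow-on sketch: out-of-sample forecasting with the fitted model via pmdarima's predict.
# forecast the next 24 months with 95% confidence intervals
fc, conf_int = stepwise_fit.predict(n_periods=24, return_conf_int=True)
print(fc[:5])
print(conf_int[:5])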
LSTM
import os
import pickle
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
path = '/user/antste/user/jjin.choi/python/lab/time-series/data/license'
filename = 'flagship_target_license.pickle'
with open(os.path.join(path, filename), mode='rb') as f:
    feature_df_dict = pickle.load(f)
feature_list = list(feature_df_dict.keys())
temp_df = feature_df_dict[feature_list[4]].groupby('timestamp').agg({'lic_usage':'sum'})
plt.plot(temp_df)
plt.xticks(visible=False)
plt.show()
# X, y
# NOTE: X and y are the same unshifted series here, so the model learns to
# reproduce the current value; for real forecasting, y is usually shifted ahead
X = temp_df
y = temp_df
X.shape
# Normalize and split train / test set
from sklearn.preprocessing import StandardScaler, MinMaxScaler
mm = MinMaxScaler()
ss = StandardScaler()
X_ss = ss.fit_transform(X)
y_mm = mm.fit_transform(y)
# Feature splits (90% train / 10% test, chronological)
split = int(X.shape[0] * 0.9)
X_train = X_ss[:split, :]
X_test = X_ss[split:, :]
# Target splits (same cut)
y_train = y_mm[:split, :]
y_test = y_mm[split:, :]
print("Training Shape : ", X_train.shape)
print("Testing Shape : ", X_test.shape)
# Preprocessing
X_train_tensors = Variable(torch.Tensor(X_train))
X_test_tensors = Variable(torch.Tensor(X_test))
y_train_tensors = Variable(torch.Tensor(y_train))
y_test_tensors = Variable(torch.Tensor(y_test))
X_train_tensors_final = torch.reshape(X_train_tensors, (X_train_tensors.shape[0], 1, X_train_tensors.shape[1]))
X_test_tensors_final = torch.reshape(X_test_tensors, (X_test_tensors.shape[0], 1, X_test_tensors.shape[1]))
print("Training Shape", X_train_tensors_final.shape, y_train_tensors.shape)
print("Testing Shape", X_test_tensors_final.shape, y_test_tensors.shape)
# Set GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # device
#print(torch.cuda.get_device_name(0))
# LSTM network (LSTM layer 1, Sequence length = 1)
class LSTM1(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length):
        super(LSTM1, self).__init__()
        self.num_classes = num_classes  # number of output classes
        self.num_layers = num_layers    # number of stacked LSTM layers
        self.input_size = input_size    # number of input features
        self.hidden_size = hidden_size  # hidden state size
        self.seq_length = seq_length    # sequence length
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc_1 = nn.Linear(hidden_size, 128)  # fully connected 1
        self.fc = nn.Linear(128, num_classes)    # fully connected last layer
        self.relu = nn.ReLU()

    def forward(self, x):
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)  # hidden state
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)  # cell state
        # Propagate input through the LSTM with the given hidden and cell state
        output, (hn, cn) = self.lstm(x, (h_0, c_0))
        hn = hn.view(-1, self.hidden_size)  # reshape for the dense layers
        out = self.relu(hn)
        out = self.fc_1(out)  # first dense layer
        out = self.relu(out)
        out = self.fc(out)    # final output
        return out
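A quick shape sanity check with a dummy batch (same constructor arguments as used below):
_model = LSTM1(num_classes=1, input_size=1, hidden_size=2, num_layers=1, seq_length=1).to(device)
_dummy = torch.zeros(4, 1, 1).to(device)  # (batch, seq_len, input_size) with batch_first=True
print(_model(_dummy).shape)               # expected: torch.Size([4, 1])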
X_train_tensors_final.shape
# Network parameter
num_epochs = 30000       # number of training epochs
learning_rate = 0.00001  # learning rate
input_size = 1           # number of input features
hidden_size = 2          # number of features in the hidden state
num_layers = 1           # number of stacked LSTM layers
num_classes = 1          # number of output classes
#lstm1 = LSTM1(num_classes, input_size, hidden_size, num_layers, X_train_tensors_final.shape[1]).to(device)
lstm1 = LSTM1(num_classes, input_size, hidden_size, num_layers, 3).to(device)
loss_function = torch.nn.MSELoss() # mean-squared error for regression
optimizer = torch.optim.Adam(lstm1.parameters(), lr=learning_rate) # adam optimizer
# Learning
for epoch in range(num_epochs):
    outputs = lstm1.forward(X_train_tensors_final.to(device))  # forward pass
    optimizer.zero_grad()  # reset gradients to zero before backprop
    # compute the loss
    loss = loss_function(outputs, y_train_tensors.to(device))
    loss.backward()   # backpropagate to compute gradients
    optimizer.step()  # update weights from the gradients
    if epoch % 100 == 0:
        print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))
# Predict
df_X_ss = ss.transform(temp_df)
df_y_mm = mm.transform(temp_df)
df_X_ss = Variable(torch.Tensor(df_X_ss)) #converting to Tensors
df_y_mm = Variable(torch.Tensor(df_y_mm))
#reshaping the dataset
df_X_ss = torch.reshape(df_X_ss, (df_X_ss.shape[0], 1, df_X_ss.shape[1]))
with torch.no_grad():  # no gradients needed for inference
    train_predict = lstm1(df_X_ss.to(device))  # forward pass over the full series
data_predict = train_predict.cpu().numpy()  # numpy conversion
dataY_plot = df_y_mm.numpy()
data_predict = mm.inverse_transform(data_predict) #reverse transformation
dataY_plot = mm.inverse_transform(dataY_plot)
plt.figure(figsize=(20, 10))
plt.axvline(x=split, c='r', linestyle='--')  # train/test split boundary
plt.plot(dataY_plot, marker='o', label='Actual Data')
plt.plot(data_predict, marker='x', label='Predicted Data')
plt.title('Time-Series Prediction')
plt.legend()
plt.show()
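A small closing sketch: score only the held-out tail, in the original units (uses the arrays built above).
from sklearn.metrics import mean_squared_error
rmse_test = np.sqrt(mean_squared_error(dataY_plot[split:], data_predict[split:]))
print("Hold-out RMSE:", rmse_test)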