In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import operator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import sample 
from IPython.display import display

from utilsforecast.losses import *
from utilsforecast.plotting import plot_series

from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean, RollingMin, RollingMax, RollingStd, Combine
from mlforecast.utils import PredictionIntervals
from mlforecast.target_transforms import GlobalSklearnTransformer

from hierarchicalforecast.methods import BottomUp, MinTrace
from hierarchicalforecast.utils import aggregate, HierarchicalPlot
from hierarchicalforecast.core import HierarchicalReconciliation

from numba import njit
from window_ops.rolling import rolling_mean
from sklearn.preprocessing import FunctionTransformer

from tsfeatures import tsfeatures

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor


# Render every float that pandas displays with exactly three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
from datasetsforecast.m5 import M5

# Read the M5 dataset from the raw-data folder (path is relative to this notebook).
# The loader hands back three frames; Y_df and S_df feed preprocess() below,
# while X_df is unpacked but not referenced again in the cells shown here.
m5_df = M5()

Y_df, X_df, S_df = m5_df.load("../../data/raw")

In [3]:
def preprocess(Y_df: pd.DataFrame, S_df: pd.DataFrame, groupby: list) -> pd.DataFrame:
    """Aggregate sales to monthly totals per group and drop late or sparse series.

    Parameters
    ----------
    Y_df : pd.DataFrame
        Observations with ``unique_id``, ``ds`` and ``y`` columns. The frame is
        left untouched (no column is added or converted in place).
    S_df : pd.DataFrame
        Attributes keyed by ``unique_id``; it is merged onto ``Y_df`` and must
        provide the column named in ``groupby``.
    groupby : list
        Single-element list with the grouping column. The output has exactly
        three columns, so the grouping column is renamed to ``unique_id``.

    Returns
    -------
    pd.DataFrame
        Columns ``unique_id`` (str), ``ds`` (first day of the month) and ``y``
        (monthly sum) for the months 2011-02 through 2016-05, restricted to
        series that
        * have their first positive month no later than 2013-12, and
        * have positive sales in more than 50% of the months from 2013-12 on.
    """
    # Derive the timestamp/month columns on a new frame so the caller's Y_df
    # keeps its original columns and dtypes.
    sales = Y_df.assign(ds=pd.to_datetime(Y_df["ds"]))
    sales["month"] = sales["ds"].dt.to_period("M")

    merged = pd.merge(S_df, sales, on="unique_id")
    # observed=False spells out the historical pandas default: for a categorical
    # grouping column every category/month pair is kept (unobserved pairs sum to 0).
    monthly = (
        merged.groupby(groupby + ["month"], observed=False)["y"].sum().reset_index()
    )
    monthly["ds"] = monthly["month"].dt.to_timestamp()  # first day of each month

    # The string bounds are compared against monthly periods, so this keeps
    # 2011-02 .. 2016-05 and excludes the boundary months themselves.
    in_window = (monthly["month"] > "2011-01-01") & (monthly["month"] < "2016-06-01")
    monthly = monthly.loc[in_window, groupby + ["ds", "y"]].copy()
    monthly.columns = ["unique_id", "ds", "y"]

    # A categorical id causes trouble when grouping again after filtering,
    # so work with plain strings from here on.
    monthly["unique_id"] = monthly["unique_id"].astype(str)

    # Drop series whose first month with positive sales is after December 2013.
    first_sale = monthly[monthly["y"] > 0].groupby("unique_id")["ds"].min()
    early_ids = first_sale[first_sale <= pd.Timestamp("2013-12-01")].index
    filtered = monthly[monthly["unique_id"].isin(early_ids)]

    # Drop series with positive sales in at most half of the months since
    # December 2013. The share is measured against the number of months in that
    # window, not against the best series, so the 50% threshold is absolute.
    recent = filtered[filtered["ds"] >= "2013-12-01"]
    n_recent_months = recent["ds"].nunique()
    positive_months = recent[recent["y"] > 0].groupby("unique_id")["ds"].count()
    positive_share = positive_months / n_recent_months * 100
    dense_ids = positive_share[positive_share > 50].index

    return filtered[filtered["unique_id"].isin(dense_ids)]

In [4]:
# Monthly sales per item_id: every unique_id that shares an item is summed together.
all_df = preprocess(Y_df=Y_df, S_df=S_df, groupby=["item_id"])

In [5]:
# Twelve month-start training cutoffs, newest first: 2016-04-01 back to 2015-05-01.
cutoffs = [
    month_start.strftime('%Y-%m-%d %H:%M:%S')
    for month_start in pd.date_range(start='2015-05-01', end='2016-04-01', freq='MS')[::-1]
]

In [6]:
# Attach each item's category/department, add the constant root level, and keep
# only the hierarchy columns plus the time series itself.
procesed_df = (
    all_df
    .merge(
        S_df[['cat_id', 'dept_id', 'item_id']].drop_duplicates(),
        left_on="unique_id",
        right_on="item_id",
    )
    .assign(top_level="Walmart")
    [['top_level', 'cat_id', 'dept_id', 'item_id', 'ds', 'y']]
)

In [7]:
# Aggregation spec, coarsest to finest: each entry extends the previous one by
# one column, ending at the individual item.
hierarchy_levels = [['top_level'],
                    ['top_level', 'cat_id'],
                    ['top_level', 'cat_id', 'dept_id'],
                    ['top_level', 'cat_id', 'dept_id', 'item_id']]
# aggregate() yields the series for every level, the matching summing matrix,
# and `tags`, which later cells index by "/"-joined level names.
Y_hier_df, S_hier_df, tags = aggregate(df=procesed_df, spec=hierarchy_levels)
# Move the index into a regular column; later cells read `unique_id` as a column.
# NOTE(review): presumably the index returned by aggregate() is `unique_id` -- confirm
# against the installed hierarchicalforecast version.
Y_hier_df = Y_hier_df.reset_index()

In [8]:
# Human-readable names for each hierarchy level, plus 'All' with every series id.
eval_tags = {
    'Total': tags['top_level'],
    'Category': tags['top_level/cat_id'],
    'Department': tags['top_level/cat_id/dept_id'],
    'Item': tags['top_level/cat_id/dept_id/item_id'],
    'All': np.concatenate(list(tags.values())),
}

def determine_category(unique_id):
    """Return the first `eval_tags` label whose members contain `unique_id`.

    Labels are tried in the insertion order of the module-level `eval_tags`
    mapping; `None` is returned when no label contains the id.
    """
    matching_labels = (
        label for label, members in eval_tags.items() if unique_id in members
    )
    return next(matching_labels, None)

In [9]:
def is_summer(dates):
    """Flag timestamps whose month is June, July or August.

    `dates` must expose `.month` with an `.isin` method (e.g. a pandas
    DatetimeIndex); one boolean is returned per timestamp.
    """
    summer_months = [6, 7, 8]
    month_numbers = dates.month
    return month_numbers.isin(summer_months)

def is_winter(dates):
    """Flag timestamps whose month is December, January or February.

    `dates` must expose `.month` with an `.isin` method (e.g. a pandas
    DatetimeIndex); one boolean is returned per timestamp.
    """
    winter_months = [12, 1, 2]
    month_numbers = dates.month
    return month_numbers.isin(winter_months)

def is_fall(dates):
    """Flag timestamps whose month is September, October or November.

    `dates` must expose `.month` with an `.isin` method (e.g. a pandas
    DatetimeIndex); one boolean is returned per timestamp.
    """
    fall_months = [9, 10, 11]
    month_numbers = dates.month
    return month_numbers.isin(fall_months)

def is_spring(dates):
    """Flag timestamps whose month is March, April or May.

    `dates` must expose `.month` with an `.isin` method (e.g. a pandas
    DatetimeIndex); one boolean is returned per timestamp.
    """
    spring_months = [3, 4, 5]
    month_numbers = dates.month
    return month_numbers.isin(spring_months)

def is_last_month(dates):
    """Flag timestamps that fall in December, the last month of the year.

    `dates` must expose `.month`; the result is the element-wise comparison
    of that attribute with 12.
    """
    december = 12
    month_numbers = dates.month
    return month_numbers == december

def is_first_month(dates):
    """Flag timestamps that fall in January, the first month of the year.

    `dates` must expose `.month`; the result is the element-wise comparison
    of that attribute with 1.
    """
    january = 1
    month_numbers = dates.month
    return month_numbers == january

def month_sin_transform(dates):
    """Sine encoding of the month on a 12-step cycle.

    Computes sin(month / 12 * 2 * pi) element-wise from `dates.month`, so
    month 3 maps to 1 and months 6 and 12 map to (numerically) 0.
    """
    months_per_cycle = 12
    cycle_fraction = dates.month / months_per_cycle
    return np.sin(cycle_fraction * 2 * np.pi)

def month_cos_transform(dates):
    """Cosine encoding of the month on a 12-step cycle.

    Computes cos(month / 12 * 2 * pi) element-wise from `dates.month`, so
    month 6 maps to -1 and month 12 maps to 1.
    """
    months_per_cycle = 12
    cycle_fraction = dates.month / months_per_cycle
    return np.cos(cycle_fraction * 2 * np.pi)

def quarter_sin_transform(dates):
    """Sine encoding of the quarter on a 4-step cycle.

    Computes sin(quarter / 4 * 2 * pi) element-wise from `dates.quarter`, so
    quarter 1 maps to 1 and quarter 3 maps to -1.
    """
    quarters_per_cycle = 4
    cycle_fraction = dates.quarter / quarters_per_cycle
    return np.sin(cycle_fraction * 2 * np.pi)

def quarter_cos_transform(dates):
    """Cosine encoding of the quarter on a 4-step cycle.

    Computes cos(quarter / 4 * 2 * pi) element-wise from `dates.quarter`, so
    quarter 2 maps to -1 and quarter 4 maps to 1.
    """
    quarters_per_cycle = 4
    cycle_fraction = dates.quarter / quarters_per_cycle
    return np.cos(cycle_fraction * 2 * np.pi)

In [10]:
# Target transform used by predict_cv: log1p on the way in, expm1 on the way back.
# check_inverse=False turns off scikit-learn's fit-time check that inverse_func
# undoes func.
sk_log1p = FunctionTransformer(func=np.log1p, inverse_func=np.expm1, validate=True, check_inverse=False)

In [11]:
def predict_cv(all_data:pd.DataFrame, cut_off, horizon:int, S_df:pd.DataFrame, tags):
    """Run one cross-validation fold: fit up to `cut_off`, forecast, reconcile, attach actuals.

    Parameters
    ----------
    all_data : pd.DataFrame
        Series in long format with `unique_id`, `ds` and `y` columns.
    cut_off
        Last timestamp included in training. It is compared against `ds` and
        copied verbatim into the `cutoff` column of the result.
    horizon : int
        Number of periods to forecast, and number of hold-out rows kept per series.
    S_df : pd.DataFrame
        Summing matrix passed to `HierarchicalReconciliation.reconcile` as `S`.
    tags
        Hierarchy tags passed to `reconcile`.

    Returns
    -------
    pd.DataFrame
        Reconciled forecasts inner-joined with the hold-out actuals on
        (`unique_id`, `ds`), plus a `cutoff` column.
    """
    df_train = all_data[all_data["ds"] <= cut_off]
    # head() selects by row position, so order the hold-out rows chronologically
    # per series first; otherwise an unsorted `all_data` would yield the wrong
    # test months.
    df_test = (
        all_data[all_data["ds"] > cut_off]
        .sort_values(["unique_id", "ds"])
        .groupby("unique_id")
        .head(horizon)
    )

    # The same rolling statistics are computed on lags 1, 6 and 12; the list is
    # rebuilt per lag so every lag gets its own transform instances.
    lag_transforms = {
        lag: [RollingMean(3), RollingMean(6), RollingMin(6), RollingMax(6), RollingStd(6)]
        for lag in (1, 6, 12)
    }

    fcst = MLForecast(
        models=[
            LGBMRegressor(verbosity=-1, objective="tweedie"),
            XGBRegressor(random_state=0, n_estimators=100, objective="reg:tweedie"),
        ],
        freq="MS",
        lags=[1, 2, 3, 4, 5, 6, 12],
        lag_transforms=lag_transforms,
        date_features=["year", "month", "quarter", month_sin_transform, month_cos_transform,
                       quarter_sin_transform, quarter_cos_transform, is_summer, is_winter,
                       is_fall, is_spring, is_first_month, is_last_month],
        target_transforms=[GlobalSklearnTransformer(sk_log1p)],
        num_threads=8,
    )

    # fitted=True keeps the in-sample predictions, which are handed to the
    # reconciliation step below as Y_df.
    fcst.fit(df_train,
             fitted=True,
             static_features=[])
    Y_hat_df = fcst.predict(horizon).set_index('unique_id')
    Y_fitted_df = fcst.forecast_fitted_values()

    reconcilers = [BottomUp(),
                   MinTrace(method='ols'),
                   MinTrace(method='mint_shrink')]
    hrec = HierarchicalReconciliation(reconcilers=reconcilers)

    Y_rec_df = hrec.reconcile(Y_hat_df=Y_hat_df,
                              Y_df=Y_fitted_df,
                              S=S_df,
                              tags=tags)

    # Inner join: only forecast rows that have a matching actual are kept.
    y_pred = Y_rec_df.merge(df_test[["unique_id", "ds", "y"]], on=["unique_id", "ds"])

    y_pred["cutoff"] = cut_off

    return y_pred

In [14]:
def cross_validation(cutoffs:list, all_data:pd.DataFrame, horizon:int, S_df:pd.DataFrame, tags):
    """Run `predict_cv` once per cutoff and stack the per-fold results.

    Parameters
    ----------
    cutoffs : list
        Training cutoffs; one fold is evaluated per entry, in the given order.
    all_data, horizon, S_df, tags
        Forwarded unchanged to `predict_cv` for every fold.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of the fold results (each fold keeps its own
        index), or an empty DataFrame when `cutoffs` is empty.
    """
    # Collect the folds first and concatenate once: growing a frame inside the
    # loop re-copies all previous folds on every iteration.
    fold_predictions = [
        predict_cv(all_data, cut_off, horizon, S_df, tags) for cut_off in cutoffs
    ]
    if not fold_predictions:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(fold_predictions, axis=0)

In [15]:
# One-step-ahead evaluation over every cutoff on the aggregated hierarchy.
all_pred = cross_validation(
    cutoffs=cutoffs,
    all_data=Y_hier_df,
    horizon=1,
    S_df=S_hier_df,
    tags=tags,
)

### With Error: adding `Combine` ratio features to the lag transforms

In [16]:
def _safe_ratio(numerator, denominator):
    """Element-wise `numerator / denominator` that never yields NaN or inf from a zero denominator.

    Where the denominator is 0 and the numerator is a number, the result is 0.0.
    NaNs already present in the inputs (e.g. rolling windows that are not yet
    full) are passed through unchanged.
    """
    numerator = np.asarray(numerator)
    denominator = np.asarray(denominator)
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = np.true_divide(numerator, denominator)
    zero_denominator = (denominator == 0) & ~np.isnan(numerator)
    return np.where(zero_denominator, 0.0, ratio)


def predict_cv(all_data:pd.DataFrame, cut_off, horizon:int, S_df:pd.DataFrame, tags):
    """Run one cross-validation fold with extra short/long rolling-mean ratio features.

    This definition replaces the earlier `predict_cv` in this notebook; it
    differs only by the `Combine` features (3-month over 12-month rolling mean)
    added to lags 1 and 6.

    Parameters
    ----------
    all_data : pd.DataFrame
        Series in long format with `unique_id`, `ds` and `y` columns.
    cut_off
        Last timestamp included in training. It is compared against `ds` and
        copied verbatim into the `cutoff` column of the result.
    horizon : int
        Number of periods to forecast, and number of hold-out rows kept per series.
    S_df : pd.DataFrame
        Summing matrix passed to `HierarchicalReconciliation.reconcile` as `S`.
    tags
        Hierarchy tags passed to `reconcile`.

    Returns
    -------
    pd.DataFrame
        Reconciled forecasts inner-joined with the hold-out actuals on
        (`unique_id`, `ds`), plus a `cutoff` column.
    """
    df_train = all_data[all_data["ds"] <= cut_off]
    # head() selects by row position, so order the hold-out rows chronologically
    # per series first; otherwise an unsorted `all_data` would yield the wrong
    # test months.
    df_test = (
        all_data[all_data["ds"] > cut_off]
        .sort_values(["unique_id", "ds"])
        .groupby("unique_id")
        .head(horizon)
    )

    # Rolling statistics shared by lags 1, 6 and 12; rebuilt per lag so every
    # lag gets its own transform instances.
    lag_transforms = {
        lag: [RollingMean(3), RollingMean(6), RollingMin(6), RollingMax(6), RollingStd(6)]
        for lag in (1, 6, 12)
    }
    # Ratio of the 3-month to the 12-month rolling mean on lags 1 and 6.
    # A plain division gives NaN/inf whenever the 12-month mean is 0 (a year of
    # zero sales, which log1p maps to 0). MLForecast drops rows with NaN
    # features, which leaves series with unequal in-sample coverage.
    # NOTE(review): that is the likely cause of the "min_trace (mint_shrink)
    # needs covariance matrix to be positive definite" exception -- confirm by
    # re-running the cross-validation.
    for lag in (1, 6):
        lag_transforms[lag].append(
            Combine(
                RollingMean(window_size=3),
                RollingMean(window_size=12),
                _safe_ratio,
            )
        )

    fcst = MLForecast(
        models=[
            LGBMRegressor(verbosity=-1, objective="tweedie"),
            XGBRegressor(random_state=0, n_estimators=100, objective="reg:tweedie"),
        ],
        freq="MS",
        lags=[1, 2, 3, 4, 5, 6, 12],
        lag_transforms=lag_transforms,
        date_features=["year", "month", "quarter", month_sin_transform, month_cos_transform,
                       quarter_sin_transform, quarter_cos_transform, is_summer, is_winter,
                       is_fall, is_spring, is_first_month, is_last_month],
        target_transforms=[GlobalSklearnTransformer(sk_log1p)],
        num_threads=8,
    )

    # fitted=True keeps the in-sample predictions, which are handed to the
    # reconciliation step below as Y_df.
    fcst.fit(df_train,
             fitted=True,
             static_features=[])
    Y_hat_df = fcst.predict(horizon).set_index('unique_id')
    Y_fitted_df = fcst.forecast_fitted_values()

    reconcilers = [BottomUp(),
                   MinTrace(method='ols'),
                   MinTrace(method='mint_shrink')]
    hrec = HierarchicalReconciliation(reconcilers=reconcilers)

    Y_rec_df = hrec.reconcile(Y_hat_df=Y_hat_df,
                              Y_df=Y_fitted_df,
                              S=S_df,
                              tags=tags)

    # Inner join: only forecast rows that have a matching actual are kept.
    y_pred = Y_rec_df.merge(df_test[["unique_id", "ds", "y"]], on=["unique_id", "ds"])

    y_pred["cutoff"] = cut_off

    return y_pred

In [17]:
def cross_validation(cutoffs:list, all_data:pd.DataFrame, horizon:int, S_df:pd.DataFrame, tags):
    """Run `predict_cv` once per cutoff and stack the per-fold results.

    Same contract as the earlier `cross_validation` definition in this
    notebook; `predict_cv` is looked up at call time, so the most recently
    defined version is the one that runs.

    Parameters
    ----------
    cutoffs : list
        Training cutoffs; one fold is evaluated per entry, in the given order.
    all_data, horizon, S_df, tags
        Forwarded unchanged to `predict_cv` for every fold.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of the fold results (each fold keeps its own
        index), or an empty DataFrame when `cutoffs` is empty.
    """
    # Collect the folds first and concatenate once: growing a frame inside the
    # loop re-copies all previous folds on every iteration.
    fold_predictions = [
        predict_cv(all_data, cut_off, horizon, S_df, tags) for cut_off in cutoffs
    ]
    if not fold_predictions:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(fold_predictions, axis=0)

In [18]:
# Repeat the one-step-ahead evaluation with the predict_cv defined just above;
# the result replaces the all_pred computed earlier in the notebook.
all_pred = cross_validation(
    cutoffs=cutoffs,
    all_data=Y_hier_df,
    horizon=1,
    S_df=S_hier_df,
    tags=tags,
)

Exception: min_trace (mint_shrink) needs covariance matrix to be positive definite.