# Import libraries
import pandas as pd
import numpy as np
from math import pi
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
# Ignore all warnings
import warnings
warnings.simplefilter("ignore")
base_train_df = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/train.csv')
base_train_df.head(5)
| | warehouse | date | orders | holiday_name | holiday | shutdown | mini_shutdown | shops_closed | winter_school_holidays | school_holidays | blackout | mov_change | frankfurt_shutdown | precipitation | snow | user_activity_1 | user_activity_2 | id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Prague_1 | 2020-12-05 | 6895.0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0.0 | 1722.0 | 32575.0 | Prague_1_2020-12-05 |
| 1 | Prague_1 | 2020-12-06 | 6584.0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0.0 | 1688.0 | 32507.0 | Prague_1_2020-12-06 |
| 2 | Prague_1 | 2020-12-07 | 7030.0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0.0 | 1696.0 | 32552.0 | Prague_1_2020-12-07 |
| 3 | Prague_1 | 2020-12-08 | 6550.0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.8 | 0.0 | 1681.0 | 32423.0 | Prague_1_2020-12-08 |
| 4 | Prague_1 | 2020-12-09 | 6910.0 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.5 | 0.0 | 1704.0 | 32410.0 | Prague_1_2020-12-09 |
base_test_df = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/test.csv')
base_test_df.head(5)
| | warehouse | date | holiday_name | holiday | shops_closed | winter_school_holidays | school_holidays | id |
|---|---|---|---|---|---|---|---|---|
| 0 | Prague_1 | 2024-03-16 | NaN | 0 | 0 | 0 | 0 | Prague_1_2024-03-16 |
| 1 | Prague_1 | 2024-03-17 | NaN | 0 | 0 | 0 | 0 | Prague_1_2024-03-17 |
| 2 | Prague_1 | 2024-03-18 | NaN | 0 | 0 | 0 | 0 | Prague_1_2024-03-18 |
| 3 | Prague_1 | 2024-03-19 | NaN | 0 | 0 | 0 | 0 | Prague_1_2024-03-19 |
| 4 | Prague_1 | 2024-03-20 | NaN | 0 | 0 | 0 | 0 | Prague_1_2024-03-20 |
# Base features
base_features = base_test_df.drop(columns=['id']).columns
test_id = base_test_df['id']
train_df = pd.concat([base_train_df[base_features], base_train_df['orders']], axis=1)
test_df = base_test_df[base_features]
# Info of the train and test datasets
print(train_df.info())
print('='*60)
print(test_df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7340 entries, 0 to 7339
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   warehouse               7340 non-null   object
 1   date                    7340 non-null   object
 2   holiday_name            218 non-null    object
 3   holiday                 7340 non-null   int64
 4   shops_closed            7340 non-null   int64
 5   winter_school_holidays  7340 non-null   int64
 6   school_holidays         7340 non-null   int64
 7   orders                  7340 non-null   float64
dtypes: float64(1), int64(4), object(3)
memory usage: 458.9+ KB
None
============================================================
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   warehouse               397 non-null    object
 1   date                    397 non-null    object
 2   holiday_name            17 non-null     object
 3   holiday                 397 non-null    int64
 4   shops_closed            397 non-null    int64
 5   winter_school_holidays  397 non-null    int64
 6   school_holidays         397 non-null    int64
dtypes: int64(4), object(3)
memory usage: 21.8+ KB
None
# Concatenate train and test data so feature engineering is applied consistently to both
all_df = pd.concat([train_df, test_df], sort=False).reset_index(drop=True)
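Concatenating train and test before encoding guarantees both sets share the same category mappings; the two frames are separated again later via the NaN values in orders. A minimal sketch of a more explicit alternative (the is_train flag is an assumption for illustration, not part of this notebook):
# Hypothetical alternative: carry an explicit flag instead of relying on NaN in 'orders'
all_df_alt = pd.concat(
    [train_df.assign(is_train=1), test_df.assign(is_train=0)],
    sort=False).reset_index(drop=True)
train_part = all_df_alt[all_df_alt['is_train'] == 1].drop(columns='is_train')
test_part = all_df_alt[all_df_alt['is_train'] == 0].drop(columns='is_train')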
# Convert the date column into numeric calendar features
date_start = pd.to_datetime(all_df['date'], errors='coerce').min()
date_cols = ['date']
for _col in date_cols:
    _dt = pd.to_datetime(all_df[_col], errors='coerce')
    all_df[_col + "_year"] = _dt.dt.year.fillna(-1)
    all_df[_col + "_month"] = _dt.dt.month.fillna(-1)
    all_df[_col + "_day"] = _dt.dt.day.fillna(-1)
    all_df[_col + "_day_of_week"] = _dt.dt.dayofweek.fillna(-1)
    all_df[_col + "_week_of_year"] = _dt.dt.isocalendar().week.fillna(-1)
    all_df[_col + "_num"] = (_dt - date_start).dt.days.fillna(-1)
    all_df[_col + "_day_of_year"] = _dt.dt.dayofyear.fillna(-1)
    # Align day-of-year across leap years (every year divisible by 4 in this data range)
    all_df[_col + "_day_of_year"] = np.where(
        (all_df[_col + "_year"] % 4 == 0) & (all_df[_col + "_month"] > 2),
        all_df[_col + "_day_of_year"] - 1,
        all_df[_col + "_day_of_year"])
    all_df[_col + "_quarter"] = _dt.dt.quarter.fillna(-1)
    all_df[_col + "_is_month_start"] = _dt.dt.is_month_start.astype(int)
    all_df[_col + "_is_month_end"] = _dt.dt.is_month_end.astype(int)
    all_df[_col + "_is_quarter_start"] = _dt.dt.is_quarter_start.astype(int)
    all_df[_col + "_is_quarter_end"] = _dt.dt.is_quarter_end.astype(int)
    # all_df[_col + '_is_weekend'] = all_df['date_day_of_week'].isin([5, 6]).astype(int)
    # all_df.drop(_col, axis=1, inplace=True)
all_df['date'] = pd.to_datetime(all_df['date'])
all_df
| | warehouse | date | holiday_name | holiday | shops_closed | winter_school_holidays | school_holidays | orders | date_year | date_month | date_day | date_day_of_week | date_week_of_year | date_num | date_day_of_year | date_quarter | date_is_month_start | date_is_month_end | date_is_quarter_start | date_is_quarter_end |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Prague_1 | 2020-12-05 | NaN | 0 | 0 | 0 | 0 | 6895.0 | 2020 | 12 | 5 | 5 | 49 | 0 | 339 | 4 | 0 | 0 | 0 | 0 |
| 1 | Prague_1 | 2020-12-06 | NaN | 0 | 0 | 0 | 0 | 6584.0 | 2020 | 12 | 6 | 6 | 49 | 1 | 340 | 4 | 0 | 0 | 0 | 0 |
| 2 | Prague_1 | 2020-12-07 | NaN | 0 | 0 | 0 | 0 | 7030.0 | 2020 | 12 | 7 | 0 | 50 | 2 | 341 | 4 | 0 | 0 | 0 | 0 |
| 3 | Prague_1 | 2020-12-08 | NaN | 0 | 0 | 0 | 0 | 6550.0 | 2020 | 12 | 8 | 1 | 50 | 3 | 342 | 4 | 0 | 0 | 0 | 0 |
| 4 | Prague_1 | 2020-12-09 | NaN | 0 | 0 | 0 | 0 | 6910.0 | 2020 | 12 | 9 | 2 | 50 | 4 | 343 | 4 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7732 | Budapest_1 | 2024-05-11 | NaN | 0 | 0 | 0 | 0 | NaN | 2024 | 5 | 11 | 5 | 19 | 1253 | 131 | 2 | 0 | 0 | 0 | 0 |
| 7733 | Budapest_1 | 2024-05-12 | NaN | 0 | 0 | 0 | 0 | NaN | 2024 | 5 | 12 | 6 | 19 | 1254 | 132 | 2 | 0 | 0 | 0 | 0 |
| 7734 | Budapest_1 | 2024-05-13 | NaN | 0 | 0 | 0 | 0 | NaN | 2024 | 5 | 13 | 0 | 20 | 1255 | 133 | 2 | 0 | 0 | 0 | 0 |
| 7735 | Budapest_1 | 2024-05-14 | NaN | 0 | 0 | 0 | 0 | NaN | 2024 | 5 | 14 | 1 | 20 | 1256 | 134 | 2 | 0 | 0 | 0 | 0 |
| 7736 | Budapest_1 | 2024-05-15 | NaN | 0 | 0 | 0 | 0 | NaN | 2024 | 5 | 15 | 2 | 20 | 1257 | 135 | 2 | 0 | 0 | 0 | 0 |
7737 rows × 20 columns
# Apply sine and cosine transformations to encode cyclical calendar features.
# The angle must be scaled by the period (12 months, ~31 days, 365 days);
# sin(2*pi*x) of a raw integer x would always be ~0.
all_df['month_sin'] = np.sin(2 * pi * all_df['date_month'] / 12)
all_df['month_cos'] = np.cos(2 * pi * all_df['date_month'] / 12)
all_df['day_sin'] = np.sin(2 * pi * all_df['date_day'] / 31)
all_df['day_cos'] = np.cos(2 * pi * all_df['date_day'] / 31)
all_df['year_sin'] = np.sin(2 * pi * all_df['date_day_of_year'] / 365)
all_df['year_cos'] = np.cos(2 * pi * all_df['date_day_of_year'] / 365)
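A quick sanity check of the encoding (an illustrative sketch, not part of the original notebook): adjacent months such as December and January land next to each other on the unit circle, which a raw month number cannot express.
# Illustrative check of the cyclical month encoding
for m in (1, 6, 12):
    print(m, round(np.sin(2 * pi * m / 12), 3), round(np.cos(2 * pi * m / 12), 3))
# 1   0.5   0.866  <- January
# 6   0.0  -1.0    <- June, opposite side of the circle
# 12 -0.0   1.0    <- December, adjacent to January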
# Replace missing holiday names with the string 'None'
all_df['holiday_name'] = all_df['holiday_name'].fillna('None')
# OneHotEncoding → holiday_name
enc = OneHotEncoder(sparse_output=False)  # 'sparse' was renamed 'sparse_output' in scikit-learn 1.2
holiday_encoded = enc.fit_transform(all_df[['holiday_name']])
encoded_df = pd.DataFrame(holiday_encoded, columns=enc.get_feature_names_out(['holiday_name']))
all_df = pd.concat([all_df, encoded_df], axis=1)
# drop holiday_name column
all_df = all_df.drop('holiday_name', axis=1)
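Because the encoder is fit on the combined train+test frame, every holiday name that appears in the test period is guaranteed its own column. As a hypothetical variant (not what this notebook does), fitting on train only would require handling unseen categories explicitly:
# Hypothetical train-only fit; categories unseen at fit time are encoded as all zeros
enc_train_only = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
enc_train_only.fit(base_train_df[['holiday_name']].fillna('None'))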
# LabelEncoding → warehouse column
le = preprocessing.LabelEncoder()
all_df['warehouse'] = le.fit_transform(all_df['warehouse'])
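For reference, the fitted LabelEncoder retains the category-to-integer mapping, which is handy when inspecting per-warehouse predictions later (an illustrative peek, not in the original):
# Inspect the warehouse → integer mapping produced by the LabelEncoder
print(dict(zip(le.classes_, le.transform(le.classes_))))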
# holiday_name
# all_df['holiday_name'] = le.fit_transform(all_df['holiday_name'])
# Flag the day before and the day after a holiday.
# Shift within each warehouse so flags do not leak across warehouse boundaries
# where different warehouses' rows sit next to each other in all_df.
all_df['holiday_before'] = all_df.groupby('warehouse')['holiday'].shift(1).fillna(0).astype(int)
all_df['holiday_after'] = all_df.groupby('warehouse')['holiday'].shift(-1).fillna(0).astype(int)
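A small boundary check (an illustrative sketch, assuming the grouped shift above and rows date-sorted within each warehouse): the first row of each warehouse block should get 0, not a value carried over from the previous warehouse.
# Each warehouse's first row must have holiday_before == 0 after the grouped shift
first_rows = all_df.groupby('warehouse').head(1)
assert (first_rows['holiday_before'] == 0).all()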
# Obtain the data for the day before or after a shops_closed → It did not lead to an improvement in the MAPE score
# all_df['shops_closed_before'] = all_df['shops_closed'].shift(1).fillna(0).astype(int)
# all_df['shops_closed_after'] = all_df['shops_closed'].shift(-1).fillna(0).astype(int)
# Obtain the data for the day before or after school_holidays → It did not lead to an improvement in the MAPE score
# all_df['winter_school_holidays_before'] = all_df['winter_school_holidays'].shift(1).fillna(0).astype(int)
# all_df['winter_school_holidays_after'] = all_df['winter_school_holidays'].shift(-1).fillna(0).astype(int)
# all_df['school_holidays_before'] = all_df['school_holidays'].shift(1).fillna(0).astype(int)
# all_df['school_holidays_after'] = all_df['school_holidays'].shift(-1).fillna(0).astype(int)
# Obtain the data for the day before or after weekends → It did not lead to an improvement in the MAPE score
# all_df['weekend_before'] = all_df['date_is_weekend'].shift(1).fillna(0).astype(int)
# all_df['weekend_after'] = all_df['date_is_weekend'].shift(-1).fillna(0).astype(int)
# Convert the data back to train_df and test_df
train_df_le = all_df[~all_df['orders'].isnull()]
test_df_le = all_df[all_df['orders'].isnull()]
train_df_le = train_df_le.drop(columns=['date'], axis=1)
test_df_le = test_df_le.drop(columns=['date'], axis=1)
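A quick consistency check (not in the original): the info() output above shows orders has no nulls in train, so splitting on NaN should reproduce the original row counts exactly.
# The NaN-based split should give back the original train/test sizes
assert len(train_df_le) == len(base_train_df)   # 7340 rows
assert len(test_df_le) == len(base_test_df)     # 397 rows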
# # Predict the user_activity_2 using LGBM and add the results to the test dataset → It did not lead to an improvement in the MAPE score
# features = [col for col in train_df_le.columns if col not in ['orders', 'user_activity_2']]
# X_user_activity2 = train_df_le[features]
# y_user_activity2 = train_df_le['user_activity_2']
# # LGBM for generating the user_activity_2 column
# X_ua2_train, X_ua2_val, y_ua2_train, y_ua2_val = train_test_split(X_user_activity2, y_user_activity2, test_size=0.2, random_state=42)
# lgb_ua2_train = lgb.Dataset(X_ua2_train, y_ua2_train)
# lgb_ua2_val = lgb.Dataset(X_ua2_val, y_ua2_val, reference=lgb_ua2_train)
# params = {
# 'objective': 'regression',
# 'metric': 'rmse',
# 'verbosity': -1,
# 'seed': 42
# }
# model_user_activity = lgb.train(params, lgb_ua2_train, valid_sets=[lgb_ua2_train, lgb_ua2_val])
# # Predict and fill in the user_activity_2
# test_df_le['user_activity_2'] = model_user_activity.predict(test_df_le[features]).round()
# Ensemble
# Split the train data
# Set the random seed
random_seed = 777
X = train_df_le.drop(columns=['orders'])
y = train_df_le['orders']
# test_size=1 holds out a single row, so effectively all labeled data is used
# for training; the hold-out evaluation further below is left commented out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1, random_state=random_seed)
# Cross-validation
# Number of splits for cross-validation
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
# Placeholders for stacking features (one column per base model)
stacking_train = np.zeros((X_train.shape[0], 8))
stacking_test = np.zeros((X_test.shape[0], 8))
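One caveat worth flagging: shuffled KFold lets each fold train on dates later than the ones it predicts, which is optimistic for a forecasting task. A hedged alternative sketch (not what this notebook uses), assuming the rows are sorted chronologically:
# Alternative CV that respects time order (illustrative only)
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
for ts_train_idx, ts_val_idx in tscv.split(X):
    # each validation fold lies strictly after its training fold in row order
    X_tr_ts, X_val_ts = X.iloc[ts_train_idx], X.iloc[ts_val_idx]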
# Initialize base models
lgb_model = lgb.LGBMRegressor(random_state=random_seed)
xgb_model = xgb.XGBRegressor(random_state=random_seed)
cat_model = CatBoostRegressor(silent=True, random_state=random_seed)
rf_model = RandomForestRegressor(random_state=random_seed)
lr_model = LinearRegression()  # LogisticRegression is a classifier, not suited to a continuous target
ad_model = AdaBoostRegressor(random_state=random_seed)
dt_model = DecisionTreeRegressor(random_state=random_seed)
gb_model = GradientBoostingRegressor(random_state=random_seed)
# Train base models with cross-validation
for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    # Train each base model on the fold
    lgb_model.fit(X_tr, y_tr)
    xgb_model.fit(X_tr, y_tr)
    cat_model.fit(X_tr, y_tr)
    rf_model.fit(X_tr, y_tr)
    lr_model.fit(X_tr, y_tr)
    ad_model.fit(X_tr, y_tr)
    dt_model.fit(X_tr, y_tr)
    gb_model.fit(X_tr, y_tr)
    # Out-of-fold predictions become the meta-model's training features
    stacking_train[val_idx, 0] = lgb_model.predict(X_val)
    stacking_train[val_idx, 1] = xgb_model.predict(X_val)
    stacking_train[val_idx, 2] = cat_model.predict(X_val)
    stacking_train[val_idx, 3] = rf_model.predict(X_val)
    stacking_train[val_idx, 4] = lr_model.predict(X_val)
    stacking_train[val_idx, 5] = ad_model.predict(X_val)
    stacking_train[val_idx, 6] = dt_model.predict(X_val)
    stacking_train[val_idx, 7] = gb_model.predict(X_val)
    # Average each model's hold-out predictions across the folds
    stacking_test[:, 0] += lgb_model.predict(X_test) / n_splits
    stacking_test[:, 1] += xgb_model.predict(X_test) / n_splits
    stacking_test[:, 2] += cat_model.predict(X_test) / n_splits
    stacking_test[:, 3] += rf_model.predict(X_test) / n_splits
    stacking_test[:, 4] += lr_model.predict(X_test) / n_splits
    stacking_test[:, 5] += ad_model.predict(X_test) / n_splits
    stacking_test[:, 6] += dt_model.predict(X_test) / n_splits
    stacking_test[:, 7] += gb_model.predict(X_test) / n_splits
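The loop above is a manual out-of-fold stacking scheme: each model predicts the rows it never saw, and its hold-out predictions are averaged over the ten folds. For comparison, scikit-learn packages the same idea in StackingRegressor; a minimal sketch (an alternative, not the notebook's method):
# Illustrative alternative using sklearn's built-in stacking
from sklearn.ensemble import StackingRegressor
stack = StackingRegressor(
    estimators=[
        ('lgb', lgb.LGBMRegressor(random_state=random_seed)),
        ('cat', CatBoostRegressor(silent=True, random_state=random_seed)),
        ('rf', RandomForestRegressor(random_state=random_seed)),
    ],
    final_estimator=LinearRegression(),
    cv=KFold(n_splits=10, shuffle=True, random_state=random_seed),
)
# stack.fit(X_train, y_train) builds the out-of-fold meta-features internally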
# Train meta-models on the out-of-fold base predictions
# meta_model = LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
meta_model_1 = LGBMRegressor(n_estimators=150, num_leaves=15, learning_rate=0.05,
                             colsample_bytree=0.6, lambda_l1=0.2, lambda_l2=0.2,
                             random_state=random_seed)
meta_model_2 = CatBoostRegressor(verbose=0, random_state=random_seed)
meta_model_3 = XGBRegressor(random_state=random_seed)
meta_model_1.fit(stacking_train, y_train)
meta_model_2.fit(stacking_train, y_train)
meta_model_3.fit(stacking_train, y_train)
# best_iteration_ is only populated when early stopping is used, so it is not read here
# Predict on test set using meta-model
meta_pred_1 = meta_model_1.predict(stacking_test)
meta_pred_2 = meta_model_2.predict(stacking_test)
meta_pred_3 = meta_model_3.predict(stacking_test)
[LightGBM] training log (condensed): each of the 10 folds trains on ~6,605 rows with 26–27 used features; the LGBM meta-model then trains on 7,339 rows of 8 stacked features. The warnings about whitespace in feature names and about lambda_l1/lambda_l2 overriding reg_alpha/reg_lambda are benign.
# # Evaluate results
# weights = {
# 'cat_test_preds': 0.55,
# 'lgb_test_preds': 0.45,
# }
# cat_test_preds_weighted = meta_pred_2 * weights['cat_test_preds']
# lgb_test_preds_weighted = meta_pred_1 * weights['lgb_test_preds']
# ensemble_preds = cat_test_preds_weighted + lgb_test_preds_weighted
# mape = mean_absolute_percentage_error(y_test, ensemble_preds)
# print(f'Simple Average Ensemble Model MAPE: {mape:.4f}')
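The score referenced throughout the comments is MAPE, which is what mean_absolute_percentage_error computes; a tiny worked example (illustrative numbers only):
# MAPE = mean(|y_true - y_pred| / |y_true|)
demo_true = np.array([100.0, 200.0])
demo_pred = np.array([110.0, 190.0])
print(mean_absolute_percentage_error(demo_true, demo_pred))  # (0.10 + 0.05) / 2 = 0.075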
# Prediction
# test_df_le = test_df_le.drop(columns=['date', 'orders'])
test_df_le = test_df_le.drop(columns=['orders'])
# Note: each base model carries the weights from the last CV fold it was fit on
lgb_pred_test = lgb_model.predict(test_df_le)
xgb_pred_test = xgb_model.predict(test_df_le)
cat_pred_test = cat_model.predict(test_df_le)
rf_pred_test = rf_model.predict(test_df_le)
lr_pred_test = lr_model.predict(test_df_le)
ad_pred_test = ad_model.predict(test_df_le)
dt_pred_test = dt_model.predict(test_df_le)
gb_pred_test = gb_model.predict(test_df_le)
# stacking_test_df_le = np.vstack([lgb_pred_test, xgb_pred_test, cat_pred_test, rf_pred_test]).T
stacking_test_df_le = np.vstack([lgb_pred_test, xgb_pred_test, cat_pred_test, rf_pred_test, lr_pred_test, ad_pred_test, dt_pred_test, gb_pred_test]).T
submit_pred_1 = meta_model_1.predict(stacking_test_df_le)
submit_pred_2 = meta_model_2.predict(stacking_test_df_le)
submit_pred_3 = meta_model_3.predict(stacking_test_df_le)
weights = {
'cat_test_preds': 1/3,
'lgb_test_preds': 1/3,
'xgb_test_preds': 1/3,
}
cat_test_preds_weighted = submit_pred_2 * weights['cat_test_preds']
lgb_test_preds_weighted = submit_pred_1 * weights['lgb_test_preds']
xgb_test_preds_weighted = submit_pred_3 * weights['xgb_test_preds']
submit_pred = cat_test_preds_weighted + lgb_test_preds_weighted + xgb_test_preds_weighted
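With equal weights of 1/3 each, the blend is simply the row-wise mean of the three meta-model predictions; a quick equivalence check (illustrative):
# Equal-weight blend is identical to the element-wise mean
assert np.allclose(submit_pred, np.mean([submit_pred_1, submit_pred_2, submit_pred_3], axis=0))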
submission = pd.DataFrame({
'id': test_id,
'Target': submit_pred
})
# Save
submission.to_csv('submission.csv', index=False)
print(submission)
                        id        Target
0      Prague_1_2024-03-16  10407.202067
1      Prague_1_2024-03-17  10097.461219
2      Prague_1_2024-03-18   9977.006335
3      Prague_1_2024-03-19   9534.993255
4      Prague_1_2024-03-20   9486.571774
..                     ...           ...
392  Budapest_1_2024-05-11   6951.808814
393  Budapest_1_2024-05-12   6536.162800
394  Budapest_1_2024-05-13   6615.462011
395  Budapest_1_2024-05-14   6712.757421
396  Budapest_1_2024-05-15   6516.652391

[397 rows x 2 columns]