Predicting Weekly Solar Power at NEOM

Ayachips

September 15, 2019

This is the Jupyter notebook from our JunctionX KAUST hackathon entry (the weekly forecast); the source is here

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model, ensemble, metrics, model_selection, preprocessing
import joblib
%matplotlib inline

Predicting Solar Power Output at NEOM

neom_data = (pd.read_csv("../data/raw/neom-data.csv", parse_dates=[0])
               .rename(columns={"Unnamed: 0": "Timestamp"})
               .set_index("Timestamp", drop=True, inplace=False))
neom_data.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 96432 entries, 2008-01-01 00:00:00 to 2018-12-31 23:00:00
Data columns (total 12 columns):
mslp(hPa)          96432 non-null float64
t2(C)              96432 non-null float64
td2(C)             96432 non-null float64
wind_speed(m/s)    96432 non-null float64
wind_dir(Deg)      96432 non-null float64
rh(%)              96432 non-null float64
GHI(W/m2)          96432 non-null float64
SWDIR(W/m2)        96432 non-null float64
SWDNI(W/m2)        96432 non-null float64
SWDIF(W/m2)        96432 non-null float64
rain(mm)           96432 non-null float64
AOD                96432 non-null float64
dtypes: float64(12)
memory usage: 9.6 MB
neom_data.head()
mslp(hPa) t2(C) td2(C) wind_speed(m/s) wind_dir(Deg) rh(%) GHI(W/m2) SWDIR(W/m2) SWDNI(W/m2) SWDIF(W/m2) rain(mm) AOD
Timestamp
2008-01-01 00:00:00 1012.751 14.887 2.606 2.669 105.078 43.686 0.0 0.0 0.0 0.0 0.0 0.098
2008-01-01 01:00:00 1012.917 14.429 3.363 2.667 106.699 47.442 0.0 0.0 0.0 0.0 0.0 0.098
2008-01-01 02:00:00 1012.966 14.580 3.778 3.341 112.426 48.357 0.0 0.0 0.0 0.0 0.0 0.098
2008-01-01 03:00:00 1013.247 14.390 3.507 3.141 102.371 48.125 0.0 0.0 0.0 0.0 0.0 0.098
2008-01-01 04:00:00 1013.083 14.388 3.869 3.607 111.300 49.295 0.0 0.0 0.0 0.0 0.0 0.098
neom_data.tail()
mslp(hPa) t2(C) td2(C) wind_speed(m/s) wind_dir(Deg) rh(%) GHI(W/m2) SWDIR(W/m2) SWDNI(W/m2) SWDIF(W/m2) rain(mm) AOD
Timestamp
2018-12-31 19:00:00 1019.779 14.653 4.380 3.587 25.919 50.340 0.0 0.0 0.0 0.0 0.0 0.098
2018-12-31 20:00:00 1019.578 13.965 2.853 2.836 35.203 47.381 0.0 0.0 0.0 0.0 0.0 0.098
2018-12-31 21:00:00 1019.172 13.624 1.923 1.922 85.974 45.275 0.0 0.0 0.0 0.0 0.0 0.098
2018-12-31 22:00:00 1018.610 13.918 1.512 2.512 103.656 43.211 0.0 0.0 0.0 0.0 0.0 0.098
2018-12-31 23:00:00 1018.611 13.442 0.733 3.146 91.084 41.836 0.0 0.0 0.0 0.0 0.0 0.098
neom_data.describe()
mslp(hPa) t2(C) td2(C) wind_speed(m/s) wind_dir(Deg) rh(%) GHI(W/m2) SWDIR(W/m2) SWDNI(W/m2) SWDIF(W/m2) rain(mm) AOD
count 96432.000000 96432.000000 96432.000000 96432.000000 96432.000000 96432.000000 96432.000000 96432.000000 96432.000000 96432.000000 96432.000000 96432.000000
mean 1010.110794 24.896298 11.045605 3.991582 164.200525 46.168410 274.757261 211.082623 331.746291 63.674490 0.009041 0.098086
std 5.613583 6.382410 7.153472 2.485326 102.793404 17.874776 355.287896 296.287340 390.765915 91.856426 0.173081 0.000805
min 996.378000 4.571000 -22.946000 0.076000 0.672000 5.708000 0.000000 0.000000 0.000000 0.000000 -0.037000 0.096000
25% 1005.539750 20.221000 5.889750 2.152000 62.935500 32.173000 0.000000 0.000000 0.000000 0.000000 0.000000 0.098000
50% 1010.050000 25.421000 11.324500 3.437000 149.692000 44.200000 0.000000 0.000000 0.000000 0.000000 0.000000 0.098000
75% 1014.316000 29.466000 16.581250 5.342000 265.977750 58.859000 579.205250 429.275500 788.745750 121.765250 0.000000 0.099000
max 1029.022000 44.186000 27.196000 16.716000 359.620000 99.929000 1103.190000 954.562000 989.816000 856.685000 14.038000 0.100000
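
One oddity in the summary: rain(mm) has a small negative minimum (-0.037 mm), which is physically impossible and presumably an artifact of the upstream weather model. A quick check of how widespread it is (a sketch; this count was not part of the original run):

negative_rain_hours = (neom_data["rain(mm)"] < 0).sum()
print(f"hours with negative rainfall: {negative_rain_hours}")
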
_ = neom_data.hist(bins=50, figsize=(20,15))

[Figure: 50-bin histograms of each of the 12 variables]

_hour = (neom_data.index
                  .hour
                  .rename("hour"))

hourly_averages = (neom_data.groupby(_hour)
                            .mean())

fig, ax = plt.subplots(1, 1)
_targets = ["GHI(W/m2)", "SWDIR(W/m2)", "SWDNI(W/m2)", "SWDIF(W/m2)"]
(hourly_averages.loc[:, _targets]
                .plot(ax=ax))
_ = ax.set_ylabel(r"$W/m^2$", rotation="horizontal")

[Figure: mean hourly GHI, SWDIR, SWDNI and SWDIF profiles (W/m^2)]

months = (neom_data.index
                   .month
                   .rename("month"))
hours = (neom_data.index
                  .hour
                  .rename("hour"))

hourly_averages_by_month = (neom_data.groupby([months, hours])
                                     .mean())
fig, axes = plt.subplots(2, 6, sharex=True, sharey=True, figsize=(12, 6))

for month in months.unique():
    row, col = (0, month - 1) if month <= 6 else (1, month - 7)
    (hourly_averages_by_month.loc[month, _targets]
                             .plot(ax=axes[row, col], legend=False))
    _ = axes[row, col].set_title(month)

_ = axes[0, 0].set_ylabel(r"$W/m^2$")
_ = axes[1, 0].set_ylabel(r"$W/m^2$")

[Figure: mean hourly irradiance profiles by month, 2x6 grid of panels]

Feature Engineering

neom_data.index.dayofweek.unique()
Int64Index([1, 2, 3, 4, 5, 6, 0], dtype='int64', name='Timestamp')
neom_data.index.weekofyear.unique()
Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
            35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
            52, 53],
           dtype='int64', name='Timestamp')
_dropped_cols = ["SWDIR(W/m2)", "SWDNI(W/m2)", "SWDIF(W/m2)"]

_year = (neom_data.index
                  .year)
_month = (neom_data.index
                   .month)
_week = (neom_data.index
                  .weekofyear)
_day = (neom_data.index
                 .dayofweek)
_hour = (neom_data.index
                  .hour)

features = (neom_data.drop(_dropped_cols, axis=1, inplace=False)
                     .assign(year=_year, month=_month, week=_week, day=_day, hour=_hour)
                     .groupby(["year", "month", "week", "day", "hour"])
                     .mean()
                     .unstack(level=["day", "hour"])  # one row per (year, month, week); 168 (day, hour) columns per variable
                     .reset_index(inplace=False)
                     .sort_index(axis=1)
                     .drop("year", axis=1, inplace=False)
                     .fillna(method="bfill", limit=2, inplace=False))  # back-fill short gaps left by partial weeks
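
The chained reshape above is dense, so here is a toy sketch (not from the original notebook) of what the groupby + unstack pair does: hourly observations become one wide row per week, with a (variable, day, hour) column MultiIndex.

_toy = pd.DataFrame({"week": [1, 1, 1, 1],
                     "day":  [0, 0, 1, 1],
                     "hour": [0, 1, 0, 1],
                     "GHI(W/m2)": [0.0, 10.0, 0.0, 12.0]})
# one row for week 1; columns become ("GHI(W/m2)", day, hour)
print(_toy.groupby(["week", "day", "hour"])
          .mean()
          .unstack(level=["day", "hour"]))
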
# want to predict the next week (168 hours) of "solar power"
efficiency_factor = 0.5

# square meters of solar cells required to generate 20 GW (231,000 m2 generates roughly 7 MW); not used below
m2_of_solar_cells_required = 660000

target = (features.loc[:, ["GHI(W/m2)"]]
                  .mul(efficiency_factor)
                  .shift(-1)
                  .rename(columns={"GHI(W/m2)": "target(W/m2)"}))
target
target(W/m2)
day 0 ... 6
hour 0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23
0 0.0 0.0 0.0 0.0 0.0 15.4625 85.7540 199.4865 279.1695 328.9370 ... 103.1305 4.8625 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 22.0325 128.9640 233.4150 315.5565 366.3655 ... 74.5395 7.7545 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 9.2310 71.1300 97.3875 65.3585 102.5280 ... 119.8195 17.9220 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 24.6660 129.5175 233.2570 316.0465 368.8395 ... 136.2110 27.3505 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 30.6040 140.9780 248.4170 334.1340 388.3575 ... 136.2110 27.3505 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
683 0.0 0.0 0.0 0.0 0.0 26.1075 98.4485 163.9655 226.5005 224.1960 ... 59.4640 0.0000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
684 0.0 0.0 0.0 0.0 0.0 33.3755 130.9790 222.7225 291.9230 331.0915 ... 46.6540 0.0000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
685 0.0 0.0 0.0 0.0 0.0 21.9080 122.6125 214.2285 285.8000 328.2050 ... 68.8560 0.0000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
686 0.0 0.0 0.0 0.0 0.0 21.6690 114.3185 214.0540 286.2330 335.3290 ... 75.8040 0.0000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
687 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

688 rows × 168 columns
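
The .shift(-1) above is what turns this into a forecasting problem: each weekly row of features is paired with the following week's (efficiency-scaled) GHI as its target. A toy sketch of the alignment (illustrative, not from the original run):

_toy = pd.DataFrame({"GHI(W/m2)": [100.0, 120.0, 90.0]})
# row i's target is row i+1's GHI times the efficiency factor; the final row
# has no "next week", so its target is NaN and is later removed by dropna()
print(_toy.mul(0.5).shift(-1))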

input_data = (features.join(target)
                      .dropna(how="any", inplace=False)
                      .sort_index(axis=1))
input_data
AOD ... wind_speed(m/s)
day 0 ... 6
hour 0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23
0 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 ... 8.923 8.388 9.168 8.932 8.275 7.644 7.218 7.145 7.254 7.519
1 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 ... 9.678 8.897 9.666 10.197 10.668 11.314 11.842 11.067 10.485 10.239
2 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.096 0.096 ... 2.601 1.350 0.610 0.728 0.907 3.140 2.474 2.689 1.970 2.567
3 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 ... 4.836 4.340 3.016 2.484 2.084 2.861 4.209 3.804 3.431 3.677
4 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 ... 6.908 5.829 6.632 8.176 8.604 8.674 8.467 7.488 6.633 5.573
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
682 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 ... 2.619 1.129 1.292 1.148 0.652 2.440 2.516 2.264 1.588 1.944
683 0.098 0.098 0.098 0.098 0.098 0.098 0.098 0.098 0.098 0.098 ... 2.619 1.129 1.292 1.148 0.652 2.440 2.516 2.264 1.588 1.944
684 0.098 0.098 0.098 0.098 0.098 0.098 0.098 0.098 0.098 0.098 ... 3.986 3.138 2.724 2.075 2.061 3.872 3.611 3.181 2.318 1.720
685 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 ... 2.699 1.082 1.608 2.728 2.096 0.757 0.874 2.458 3.756 3.466
686 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 0.097 ... 1.157 1.639 4.840 5.091 4.948 5.264 5.263 4.680 4.376 3.756

687 rows × 1682 columns

Train, Validation, Test Split

# use first 10 years for training data...
training_data = input_data.loc[:10 * 53]

# ...next two years for validation data...
validation_data = input_data.loc[10 * 53 + 1:12 * 53]

# ...and final year for testing data!
testing_data = input_data.loc[12 * 53 + 1:]
training_data.shape
(531, 1682)
validation_data.shape
(106, 1682)
testing_data.shape
(50, 1682)
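
Note that .loc slicing on labels is end-inclusive, which is why the validation and testing slices start one past the previous endpoint. A quick sanity check that the three slices are disjoint and chronological (a sketch):

assert training_data.index.max() < validation_data.index.min()
assert validation_data.index.max() < testing_data.index.min()
print(len(training_data), len(validation_data), len(testing_data))  # 531, 106, 50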

Preprocessing the training and validation data

def preprocess_features(df: pd.DataFrame) -> pd.DataFrame:
    _numeric_features = ["GHI(W/m2)",
                         "mslp(hPa)",
                         "rain(mm)",
                         "rh(%)",
                         "t2(C)",
                         "td2(C)",
                         "wind_dir(Deg)",
                         "wind_speed(m/s)"]

    _ordinal_features = ["AOD",
                         "day",
                         "month",
                         "year"]

    # caveat: the scaler and encoder are re-fit on every frame passed in, so
    # validation/testing features end up scaled by their own statistics rather
    # than the training statistics (see the leakage-free sketch below); note
    # also that "day" and "year" are no longer top-level columns of the weekly
    # feature frame (day was unstacked into the column index, year was dropped)
    standard_scaler = preprocessing.StandardScaler()
    Z0 = standard_scaler.fit_transform(df.loc[:, _numeric_features])

    ordinal_encoder = preprocessing.OrdinalEncoder()
    Z1 = ordinal_encoder.fit_transform(df.loc[:, _ordinal_features])
    transformed_features = np.hstack((Z0, Z1))

    return transformed_features


training_features = training_data.drop("target(W/m2)", axis=1, inplace=False)
training_target = training_data.loc[:, ["target(W/m2)"]]
transformed_training_features = preprocess_features(training_features)

validation_features = validation_data.drop("target(W/m2)", axis=1, inplace=False)
validation_target = validation_data.loc[:, ["target(W/m2)"]]
transformed_validation_features = preprocess_features(validation_features)
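
The caveat made concrete: because preprocess_features fits a fresh StandardScaler on whatever frame it receives, the validation (and later testing) features are scaled with their own statistics instead of the training statistics. A minimal leakage-free sketch, fitting once on the training features and reusing the fitted scaler:

# sketch of a leakage-free variant: fit on the training data only, reuse elsewhere
_numeric_features = ["GHI(W/m2)", "mslp(hPa)", "rain(mm)", "rh(%)",
                     "t2(C)", "td2(C)", "wind_dir(Deg)", "wind_speed(m/s)"]

_scaler = preprocessing.StandardScaler()
_scaler.fit(training_features.loc[:, _numeric_features])

Z_train = _scaler.transform(training_features.loc[:, _numeric_features])
Z_valid = _scaler.transform(validation_features.loc[:, _numeric_features])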

Find a few models that seem to work well

Linear Regression

# training a linear regression model
linear_regression = linear_model.LinearRegression()
linear_regression.fit(transformed_training_features, training_target)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# measure training error
_predictions = linear_regression.predict(transformed_training_features)
np.sqrt(metrics.mean_squared_error(training_target, _predictions))
3.5984482821003514e-13
# measure validation error
_predictions = linear_regression.predict(transformed_validation_features)
np.sqrt(metrics.mean_squared_error(validation_target, _predictions))
41.65348268671971
# user requests forecast for some week
user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = linear_regression.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("Weekly forecast")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")
Text(0.5, 0, 'Hour')

[Figure: predicted vs. actual weekly forecast (linear regression)]

Linear regression is overfitting! With far more input features than training rows, ordinary least squares interpolates the training data almost exactly (hence the ~1e-13 training RMSE) while generalizing poorly.

MultiTask ElasticNet Regression

# training a multi-task elastic net model
_prng = np.random.RandomState(42)
elastic_net = linear_model.MultiTaskElasticNet(random_state=_prng)
elastic_net.fit(transformed_training_features, training_target)
/Users/pughdr/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:1803: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 3211704.2181589045, tolerance: 24372.85566059234
  check_random_state(self.random_state), random)





MultiTaskElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
                    max_iter=1000, normalize=False,
                    random_state=<mtrand.RandomState object at 0x1a2be47f30>,
                    selection='cyclic', tol=0.0001, warm_start=False)
# measure training error
_predictions = elastic_net.predict(transformed_training_features)
np.sqrt(metrics.mean_squared_error(training_target, _predictions))
12.62446635442459
# measure validation error
_predictions = elastic_net.predict(transformed_validation_features)
np.sqrt(metrics.mean_squared_error(validation_target, _predictions))
21.755388481701576
user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = elastic_net.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("Weekly forecast")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")
Text(0.5, 0, 'Hour')

[Figure: predicted vs. actual weekly forecast (MultiTask ElasticNet)]

MultiTask ElasticNet with default hyper-parameters still shows a sizeable train/validation gap, but it generalizes significantly better than plain linear regression.

MultiTask Lasso Regression

# training a multi-task lasso model
_prng = np.random.RandomState(42)
lasso_regression = linear_model.MultiTaskLasso(random_state=_prng)
lasso_regression.fit(transformed_training_features, training_target)
/Users/pughdr/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:1803: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 344470.44411674794, tolerance: 24372.85566059234
  check_random_state(self.random_state), random)





MultiTaskLasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
               normalize=False,
               random_state=<mtrand.RandomState object at 0x1a2cb81c18>,
               selection='cyclic', tol=0.0001, warm_start=False)
# measure training error
_predictions = lasso_regression.predict(transformed_training_features)
np.sqrt(metrics.mean_squared_error(training_target, _predictions))
9.142156744519834
# measure validation error
_predictions = lasso_regression.predict(transformed_validation_features)
np.sqrt(metrics.mean_squared_error(validation_target, _predictions))
25.11003195102284
user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = lasso_regression.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("1 January 2016")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")
Text(0.5, 0, 'Hour')

[Figure: predicted vs. actual weekly forecast (MultiTask Lasso)]

Lasso regression is also overfitting: training RMSE is 9.1 versus 25.1 on validation.

Random Forest Regression

_prng = np.random.RandomState(42)
random_forest_regressor = ensemble.RandomForestRegressor(n_estimators=100, random_state=_prng, n_jobs=2)
random_forest_regressor.fit(transformed_training_features, training_target)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                      oob_score=False,
                      random_state=<mtrand.RandomState object at 0x1a2cc2bbd0>,
                      verbose=0, warm_start=False)
# measure training error
_predictions = random_forest_regressor.predict(transformed_training_features)
np.sqrt(metrics.mean_squared_error(training_target, _predictions))
7.011236706071975
# measure validation error
_predictions = random_forest_regressor.predict(transformed_validation_features)
np.sqrt(metrics.mean_squared_error(validation_target, _predictions))
19.32772745697949
user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = random_forest_regressor.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("Weekly Forecast")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")
Text(0.5, 0, 'Hour')

[Figure: predicted vs. actual weekly forecast (random forest)]

Random forest with default parameters is overfitting and needs to be regularized.

Tuning hyper-parameters

from scipy import stats

MultiTask ElasticNet Regression

_prng = np.random.RandomState(42)

_param_distributions = {
    "l1_ratio": stats.uniform(),
    "alpha": stats.lognorm(s=1),
}

elastic_net_randomized_search = model_selection.RandomizedSearchCV(
    elastic_net,
    param_distributions=_param_distributions,
    scoring="neg_mean_squared_error",
    random_state=_prng,
    n_iter=10,
    cv=8,
    n_jobs=2,
    verbose=10
)

elastic_net_randomized_search.fit(transformed_training_features, training_target)
Fitting 8 folds for each of 10 candidates, totalling 80 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  2.8min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  5.5min



KeyboardInterrupt (traceback truncated: this search run was interrupted part-way through)
_ = joblib.dump(elastic_net_randomized_search.best_estimator_,
                "../models/weekly/tuned-elasticnet-regression-model.pkl")
elastic_net_randomized_search.best_estimator_
MultiTaskElasticNet(alpha=2.154232968599504, copy_X=True, fit_intercept=True,
                    l1_ratio=0.9699098521619943, max_iter=1000, normalize=False,
                    random_state=<mtrand.RandomState object at 0x1a48e31558>,
                    selection='cyclic', tol=0.0001, warm_start=False)
(-elastic_net_randomized_search.best_score_)**0.5
18.355092813714375
user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = elastic_net_randomized_search.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("Typical weekyl forecast")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")
Text(0.5, 0, 'Hour')

[Figure: predicted vs. actual weekly forecast (tuned MultiTask ElasticNet)]
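
Aside: it helps to see what the search distributions actually propose. stats.lognorm(s=1) keeps alpha centred near 1 with a long right tail, while stats.uniform() draws l1_ratio from [0, 1). A quick sketch (output varies with the seed):

_rng = np.random.RandomState(42)
print(stats.lognorm(s=1).rvs(size=5, random_state=_rng).round(3))  # candidate alphas
print(stats.uniform().rvs(size=5, random_state=_rng).round(3))     # candidate l1_ratios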

MultiTask Lasso Regression

_prng = np.random.RandomState(42)

_param_distributions = {
    "alpha": stats.lognorm(s=1),
}

lasso_regression_randomized_search = model_selection.RandomizedSearchCV(
    lasso_regression,
    param_distributions=_param_distributions,
    scoring="neg_mean_squared_error",
    random_state=_prng,
    n_iter=10,
    cv=8,
    n_jobs=2,
    verbose=10
)

lasso_regression_randomized_search.fit(transformed_training_features, training_target)
Fitting 8 folds for each of 10 candidates, totalling 80 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    7.0s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    9.4s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:   16.5s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:   22.5s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:   30.3s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   38.4s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   48.6s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.0min
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done  80 out of  80 | elapsed:  1.8min finished





RandomizedSearchCV(cv=8, error_score='raise-deprecating',
                   estimator=MultiTaskLasso(alpha=1.0, copy_X=True,
                                            fit_intercept=True, max_iter=1000,
                                            normalize=False,
                                            random_state=<mtrand.RandomState object at 0x1a649d5870>,
                                            selection='cyclic', tol=0.0001,
                                            warm_start=False),
                   iid='warn', n_iter=10, n_jobs=2,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a7af2b470>},
                   pre_dispatch='2*n_jobs',
                   random_state=<mtrand.RandomState object at 0x1a69473438>,
                   refit=True, return_train_score=False,
                   scoring='neg_mean_squared_error', verbose=10)
_ = joblib.dump(lasso_regression_randomized_search.best_estimator_,
                "../models/weekly/tuned-lasso-regression-model.pkl")
(-lasso_regression_randomized_search.best_score_)**0.5
17.956920842745834
user_forecast_request = transformed_validation_features[-7:, :]
user_forecast_response = lasso_regression_randomized_search.predict(user_forecast_request)
actual_values_response = validation_target.values[-7:, :]

# this would be rendered in Tableau!
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
ax.plot(user_forecast_response.flatten(), label="predicted")
ax.plot(actual_values_response.flatten(), label="actual")
ax.legend()
ax.set_title("Last week of December 2015")
ax.set_ylabel("Solar Power (W/m2)")
ax.set_xlabel("Hours")
Text(0.5, 0, 'Hours')

[Figure: predicted vs. actual hourly values over the final validation weeks (tuned MultiTask Lasso)]

Random Forest Regressor

_prng = np.random.RandomState(42)

_param_distributions = {
    "n_estimators": stats.geom(p=0.01),
     "min_samples_split": stats.beta(a=1, b=99),
     "min_samples_leaf": stats.beta(a=1, b=999),
}

# a time-series-aware splitter; note that it is defined here but not passed to the search below (cv=3 uses the default K-fold)
_cv = model_selection.TimeSeriesSplit(max_train_size=None, n_splits=5)

random_forest_randomized_search = model_selection.RandomizedSearchCV(
    random_forest_regressor,
    param_distributions=_param_distributions,
    scoring="neg_mean_squared_error",
    random_state=_prng,
    n_iter=10,
    cv=3,
    n_jobs=2,
    verbose=10
)

random_forest_randomized_search.fit(transformed_training_features, training_target)
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  2.3min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 14.0min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed: 14.8min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed: 15.4min
[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed: 18.4min finished





RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100, n_jobs=2,
                                                   oob_score=False,
                                                   random_state=<mt...
                   param_distributions={'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a28d373c8>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a28d375f8>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a28d28630>},
                   pre_dispatch='2*n_jobs',
                   random_state=<mtrand.RandomState object at 0x1a2886e828>,
                   refit=True, return_train_score=False,
                   scoring='neg_mean_squared_error', verbose=10)
_ = joblib.dump(random_forest_randomized_search.best_estimator_,
                "../models/weekly/tuned-random-forest-regression-model.pkl")
random_forest_randomized_search.best_estimator_
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=0.001249828663231378,
                      min_samples_split=0.0019415164208264953,
                      min_weight_fraction_leaf=0.0, n_estimators=75, n_jobs=2,
                      oob_score=False,
                      random_state=<mtrand.RandomState object at 0x1a292509d8>,
                      verbose=0, warm_start=False)
(-random_forest_randomized_search.best_score_)**0.5
19.002834611158974
# user requests a forecast for the coming week, predicted from the most recent week of validation features!
user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = random_forest_randomized_search.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("Typical weekly forecast")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")

[Figure: predicted vs. actual weekly forecast (tuned random forest)]

Assess model performance on testing data

testing_features = testing_data.drop("target(W/m2)", axis=1, inplace=False)
testing_target = testing_data.loc[:, ["target(W/m2)"]]
transformed_testing_features = preprocess_features(testing_features)
elastic_net_predictions = elastic_net_randomized_search.predict(transformed_testing_features)
np.sqrt(metrics.mean_squared_error(testing_target, elastic_net_predictions))
19.76022744338562
lasso_regression_predictions = lasso_regression_randomized_search.predict(transformed_testing_features)
np.sqrt(metrics.mean_squared_error(testing_target, lasso_regression_predictions))
19.73446980783998
# finally, the tuned random forest
random_forest_predictions = random_forest_randomized_search.predict(transformed_testing_features)
np.sqrt(metrics.mean_squared_error(testing_target, random_forest_predictions))
19.826312994581507
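
Collecting the three test RMSEs side by side (values copied from the outputs above) makes the comparison easier; contrary to the tuning results, lasso is marginally best here, though all three agree to within roughly 0.1 W/m2:

test_rmse = pd.Series({"MultiTaskElasticNet": 19.760,
                       "MultiTaskLasso": 19.734,
                       "RandomForest": 19.826}, name="test RMSE (W/m2)")
print(test_rmse.sort_values())
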
# user requests forecast for last week of 2018
user_forecast_request = transformed_testing_features[[-1], :]
user_forecast_response = random_forest_randomized_search.predict(user_forecast_request)[0]
actual_values_response = testing_target.values[[-1], :][0]

# this would be rendered in Tableau!
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
ax.plot(user_forecast_response.flatten(), label="predicted")
ax.plot(actual_values_response.flatten(), label="actual")
ax.legend()
ax.set_title("Typical weekly forecast")
ax.set_ylabel("Solar Power (W/m2)")
ax.set_xlabel("Hours")
plt.savefig("../results/img/typical-weekly-actual-vs-predicted-solar-power.png")

[Figure: predicted vs. actual weekly forecast on the testing data (tuned random forest)]

# combine the training and validation data
combined_training_features = pd.concat([training_features, validation_features])
transformed_combined_training_features = preprocess_features(combined_training_features)  # note: not used below
combined_training_target = pd.concat([training_target, validation_target])

# tune a random forest regressor using CV to avoid overfitting
_prng = np.random.RandomState(42)

_param_distributions = {
    "n_estimators": stats.geom(p=0.01),
     "min_samples_split": stats.beta(a=1, b=99),
     "min_samples_leaf": stats.beta(a=1, b=999),
}

tuned_random_forest_regressor = model_selection.RandomizedSearchCV(
    ensemble.RandomForestRegressor(n_estimators=100, random_state=_prng),
    param_distributions=_param_distributions,
    scoring="neg_mean_squared_error",
    random_state=_prng,
    n_iter=10,
    cv=5,
    n_jobs=2,
    verbose=10
)

# note: the final model is fit on the raw (untransformed) feature frame, so new
# raw feature rows can be passed straight to predict() below
tuned_random_forest_regressor.fit(combined_training_features, combined_training_target)
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  2.5min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  4.9min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 10.5min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed: 14.2min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed: 16.4min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed: 17.9min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 20.3min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 26.2min
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed: 28.3min finished
/Users/pughdr/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/sklearn/model_selection/_search.py:813: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)





RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                                                   n_jobs=None, oob_score=False,
                                                   random_state=...
                   param_distributions={'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a32bef860>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a32bef198>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a32bef7f0>},
                   pre_dispatch='2*n_jobs',
                   random_state=<mtrand.RandomState object at 0x1a2cc608b8>,
                   refit=True, return_train_score=False,
                   scoring='neg_mean_squared_error', verbose=10)
tuned_random_forest_regressor.best_estimator_
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=0.0004129137216396695,
                      min_samples_split=0.004789233605970227,
                      min_weight_fraction_leaf=0.0, n_estimators=78,
                      n_jobs=None, oob_score=False,
                      random_state=<mtrand.RandomState object at 0x1a32c13948>,
                      verbose=0, warm_start=False)
(-tuned_random_forest_regressor.best_score_)**0.5
18.785573993005393
# user requests forecast for last week of 2018, now served by the re-trained model
user_forecast_request = testing_features.iloc[[-1]]
user_forecast_response = tuned_random_forest_regressor.predict(user_forecast_request)[0]
actual_values_response = testing_target.values[[-1], :][0]

# this would be rendered in Tableau!
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
ax.plot(user_forecast_response.flatten(), label="predicted")
ax.plot(actual_values_response.flatten(), label="actual")
ax.legend()
ax.set_title("Typical weekly forecast")
ax.set_ylabel("Solar Power (W/m2)")
ax.set_xlabel("Hours")
plt.savefig("../results/img/typical-weekly-actual-vs-predicted-solar-power.png")

[Figure: predicted vs. actual weekly forecast on the testing data (re-trained random forest)]

Forecasting the future of solar power at NEOM

Once trained, the model can generate a forecast of next week's solar power output on demand. As actual generation values are observed each week, the model can be automatically re-trained and improved, and the same pipeline can be re-run with longer (e.g., monthly) forecast horizons if required.
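
A minimal sketch of that weekly update loop (the helper and its arguments are hypothetical, not part of this notebook):

def weekly_update(model, observed_features, observed_target):
    # hypothetical helper: re-fit on everything observed so far, then
    # forecast the coming week from the most recent week of features
    model.fit(observed_features, observed_target)
    latest_week = observed_features.iloc[[-1]]
    return model.predict(latest_week)[0]  # 168 hourly predictions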

# row 687 is the final week of observed features (its target was NaN, so it was dropped from input_data)
incoming_features = features.loc[[687]]
new_predictions = tuned_random_forest_regressor.predict(incoming_features)[0]
solar_power_forecast = (pd.DataFrame.from_dict({"Timestamp": pd.date_range(start="2019-01-01", end="2019-01-07 23:00:00", freq='H'),
                                                "Predicted Solar Power (W/m2)": new_predictions})
                          .set_index("Timestamp", inplace=False))
_ = solar_power_forecast.plot()

[Figure: predicted solar power for 1-7 January 2019]