This is the jupyter notebook from our JunctionX kaust hackaton (weekly firecast), source is here

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model, ensemble, metrics, model_selection, preprocessing
import joblib

%matplotlib inline

Predicting Solar Power Output at NEOM

neom_data = (pd.read_csv("../data/raw/neom-data.csv", parse_dates=[0])
               .rename(columns={"Unnamed: 0": "Timestamp"})
               .set_index("Timestamp", drop=True, inplace=False))

neom_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 96432 entries, 2008-01-01 00:00:00 to 2018-12-31 23:00:00
Data columns (total 12 columns):
mslp(hPa)          96432 non-null float64
t2(C)              96432 non-null float64
td2(C)             96432 non-null float64
wind_speed(m/s)    96432 non-null float64
wind_dir(Deg)      96432 non-null float64
rh(%)              96432 non-null float64
GHI(W/m2)          96432 non-null float64
SWDIR(W/m2)        96432 non-null float64
SWDNI(W/m2)        96432 non-null float64
SWDIF(W/m2)        96432 non-null float64
rain(mm)           96432 non-null float64
AOD                96432 non-null float64
dtypes: float64(12)
memory usage: 9.6 MB

neom_data.head()

	mslp(hPa)	t2(C)	td2(C)	wind_speed(m/s)	wind_dir(Deg)	rh(%)	GHI(W/m2)	SWDIR(W/m2)	SWDNI(W/m2)	SWDIF(W/m2)	rain(mm)	AOD
Timestamp
2008-01-01 00:00:00	1012.751	14.887	2.606	2.669	105.078	43.686	0.0	0.0	0.0	0.0	0.0	0.098
2008-01-01 01:00:00	1012.917	14.429	3.363	2.667	106.699	47.442	0.0	0.0	0.0	0.0	0.0	0.098
2008-01-01 02:00:00	1012.966	14.580	3.778	3.341	112.426	48.357	0.0	0.0	0.0	0.0	0.0	0.098
2008-01-01 03:00:00	1013.247	14.390	3.507	3.141	102.371	48.125	0.0	0.0	0.0	0.0	0.0	0.098
2008-01-01 04:00:00	1013.083	14.388	3.869	3.607	111.300	49.295	0.0	0.0	0.0	0.0	0.0	0.098

neom_data.tail()

	mslp(hPa)	t2(C)	td2(C)	wind_speed(m/s)	wind_dir(Deg)	rh(%)	GHI(W/m2)	SWDIR(W/m2)	SWDNI(W/m2)	SWDIF(W/m2)	rain(mm)	AOD
Timestamp
2018-12-31 19:00:00	1019.779	14.653	4.380	3.587	25.919	50.340	0.0	0.0	0.0	0.0	0.0	0.098
2018-12-31 20:00:00	1019.578	13.965	2.853	2.836	35.203	47.381	0.0	0.0	0.0	0.0	0.0	0.098
2018-12-31 21:00:00	1019.172	13.624	1.923	1.922	85.974	45.275	0.0	0.0	0.0	0.0	0.0	0.098
2018-12-31 22:00:00	1018.610	13.918	1.512	2.512	103.656	43.211	0.0	0.0	0.0	0.0	0.0	0.098
2018-12-31 23:00:00	1018.611	13.442	0.733	3.146	91.084	41.836	0.0	0.0	0.0	0.0	0.0	0.098

neom_data.describe()

	mslp(hPa)	t2(C)	td2(C)	wind_speed(m/s)	wind_dir(Deg)	rh(%)	GHI(W/m2)	SWDIR(W/m2)	SWDNI(W/m2)	SWDIF(W/m2)	rain(mm)	AOD
count	96432.000000	96432.000000	96432.000000	96432.000000	96432.000000	96432.000000	96432.000000	96432.000000	96432.000000	96432.000000	96432.000000	96432.000000
mean	1010.110794	24.896298	11.045605	3.991582	164.200525	46.168410	274.757261	211.082623	331.746291	63.674490	0.009041	0.098086
std	5.613583	6.382410	7.153472	2.485326	102.793404	17.874776	355.287896	296.287340	390.765915	91.856426	0.173081	0.000805
min	996.378000	4.571000	-22.946000	0.076000	0.672000	5.708000	0.000000	0.000000	0.000000	0.000000	-0.037000	0.096000
25%	1005.539750	20.221000	5.889750	2.152000	62.935500	32.173000	0.000000	0.000000	0.000000	0.000000	0.000000	0.098000
50%	1010.050000	25.421000	11.324500	3.437000	149.692000	44.200000	0.000000	0.000000	0.000000	0.000000	0.000000	0.098000
75%	1014.316000	29.466000	16.581250	5.342000	265.977750	58.859000	579.205250	429.275500	788.745750	121.765250	0.000000	0.099000
max	1029.022000	44.186000	27.196000	16.716000	359.620000	99.929000	1103.190000	954.562000	989.816000	856.685000	14.038000	0.100000

_ = neom_data.hist(bins=50, figsize=(20,15))

png

_hour = (neom_data.index
                  .hour
                  .rename("hour"))

hourly_averages = (neom_data.groupby(_hour)
                            .mean())

fig, ax = plt.subplots(1, 1)
_targets = ["GHI(W/m2)", "SWDIR(W/m2)", "SWDNI(W/m2)", "SWDIF(W/m2)"]
(hourly_averages.loc[:, _targets]
                .plot(ax=ax))
_ = ax.set_ylabel(r"$W/m^2$", rotation="horizontal")

png

months = (neom_data.index
                   .month
                   .rename("month"))
hours = (neom_data.index
                  .hour
                  .rename("hour"))

hourly_averages_by_month = (neom_data.groupby([months, hours])
                                     .mean())

fig, axes = plt.subplots(2, 6, sharex=True, sharey=True, figsize=(12, 6))

for month in months.unique():
    if month <= 6:
        (hourly_averages_by_month.loc[month, _targets]
                                 .plot(ax=axes[0, month - 1], legend=False))
        _ = axes[0, month - 1].set_title(month)
    else:
        (hourly_averages_by_month.loc[month, _targets]
                                 .plot(ax=axes[1, month - 7], legend=False))
        _ = axes[1, month - 7].set_title(month)
    
    if month - 1 == 0: 
        _ = axes[0, 0].set_ylabel(r"$W/m^2$")
    if month - 7 == 0: 
        _ = axes[1, 0].set_ylabel(r"$W/m^2$")

png

Feature Engineering

neom_data.index.dayofweek.unique()

Int64Index([1, 2, 3, 4, 5, 6, 0], dtype='int64', name='Timestamp')

neom_data.index.weekofyear.unique()

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
            35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
            52, 53],
           dtype='int64', name='Timestamp')

_dropped_cols = ["SWDIR(W/m2)", "SWDNI(W/m2)", "SWDIF(W/m2)"]

_year = (neom_data.index
                  .year)
_month = (neom_data.index
                   .month)
_week = (neom_data.index
                  .weekofyear)
_day = (neom_data.index
                 .dayofweek)
_hour = (neom_data.index
                  .hour)

features = (neom_data.drop(_dropped_cols, axis=1, inplace=False)
                     .assign(year=_year, month=_month, week=_week, day=_day, hour=_hour)
                     .groupby(["year", "month", "week", "day", "hour"])
                     .mean()
                     .unstack(level=["day", "hour"])
                     .reset_index(inplace=False)
                     .sort_index(axis=1)
                     .drop("year", axis=1, inplace=False)
                     .fillna(method="bfill", limit=2, inplace=False))

# want to predict the next 24 hours of "solar power"
efficiency_factor = 0.5

# square meters of solar cells required to generate 20 GW (231000 m2 will generate 7mW)
m2_of_solar_cells_required = 660000

target = (features.loc[:, ["GHI(W/m2)"]]
                  .mul(efficiency_factor)
                  .shift(-1)
                  .rename(columns={"GHI(W/m2)": "target(W/m2)"}))

target

	target(W/m2)
day	0										...	6
hour	0	1	2	3	4	5	6	7	8	9	...	14	15	16	17	18	19	20	21	22	23
0	0.0	0.0	0.0	0.0	0.0	15.4625	85.7540	199.4865	279.1695	328.9370	...	103.1305	4.8625	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	0.0	0.0	0.0	0.0	0.0	22.0325	128.9640	233.4150	315.5565	366.3655	...	74.5395	7.7545	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0.0	0.0	0.0	0.0	0.0	9.2310	71.1300	97.3875	65.3585	102.5280	...	119.8195	17.9220	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0.0	0.0	0.0	0.0	0.0	24.6660	129.5175	233.2570	316.0465	368.8395	...	136.2110	27.3505	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	0.0	0.0	0.0	0.0	0.0	30.6040	140.9780	248.4170	334.1340	388.3575	...	136.2110	27.3505	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
683	0.0	0.0	0.0	0.0	0.0	26.1075	98.4485	163.9655	226.5005	224.1960	...	59.4640	0.0000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
684	0.0	0.0	0.0	0.0	0.0	33.3755	130.9790	222.7225	291.9230	331.0915	...	46.6540	0.0000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
685	0.0	0.0	0.0	0.0	0.0	21.9080	122.6125	214.2285	285.8000	328.2050	...	68.8560	0.0000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
686	0.0	0.0	0.0	0.0	0.0	21.6690	114.3185	214.0540	286.2330	335.3290	...	75.8040	0.0000	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
687	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

688 rows × 168 columns

input_data = (features.join(target)
                      .dropna(how="any", inplace=False)
                      .sort_index(axis=1))

input_data

	AOD										...	wind_speed(m/s)
day	0										...	6
hour	0	1	2	3	4	5	6	7	8	9	...	14	15	16	17	18	19	20	21	22	23
0	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	...	8.923	8.388	9.168	8.932	8.275	7.644	7.218	7.145	7.254	7.519
1	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	...	9.678	8.897	9.666	10.197	10.668	11.314	11.842	11.067	10.485	10.239
2	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.096	0.096	...	2.601	1.350	0.610	0.728	0.907	3.140	2.474	2.689	1.970	2.567
3	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	...	4.836	4.340	3.016	2.484	2.084	2.861	4.209	3.804	3.431	3.677
4	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	...	6.908	5.829	6.632	8.176	8.604	8.674	8.467	7.488	6.633	5.573
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
682	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	...	2.619	1.129	1.292	1.148	0.652	2.440	2.516	2.264	1.588	1.944
683	0.098	0.098	0.098	0.098	0.098	0.098	0.098	0.098	0.098	0.098	...	2.619	1.129	1.292	1.148	0.652	2.440	2.516	2.264	1.588	1.944
684	0.098	0.098	0.098	0.098	0.098	0.098	0.098	0.098	0.098	0.098	...	3.986	3.138	2.724	2.075	2.061	3.872	3.611	3.181	2.318	1.720
685	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	...	2.699	1.082	1.608	2.728	2.096	0.757	0.874	2.458	3.756	3.466
686	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	0.097	...	1.157	1.639	4.840	5.091	4.948	5.264	5.263	4.680	4.376	3.756

687 rows × 1682 columns

Train, Test Split

# use first 10 years for training data...
training_data = input_data.loc[:10 * 53]

# ...next two years for validation data...
validation_data = input_data.loc[10 * 53 + 1:12 * 53]

# ...and final year for testing data!
testing_data = input_data.loc[12 * 53 + 1:]

training_data.shape

(531, 1682)

validation_data.shape

(106, 1682)

testing_data.shape

(50, 1682)

Preprocessing the training and validation data

def preprocess_features(df: pd.DataFrame) -> pd.DataFrame:
    _numeric_features = ["GHI(W/m2)",
                         "mslp(hPa)",
                         "rain(mm)",
                         "rh(%)",
                         "t2(C)",
                         "td2(C)",
                         "wind_dir(Deg)",
                         "wind_speed(m/s)"]

    _ordinal_features = ["AOD",
                         "day",
                         "month",
                         "year"]

    standard_scalar = preprocessing.StandardScaler()
    Z0 = standard_scalar.fit_transform(df.loc[:, _numeric_features])
    
    ordinal_encoder = preprocessing.OrdinalEncoder()
    Z1 = ordinal_encoder.fit_transform(df.loc[:, _ordinal_features])
    transformed_features = np.hstack((Z0, Z1))
    
    return transformed_features

training_features = training_data.drop("target(W/m2)", axis=1, inplace=False)
training_target = training_data.loc[:, ["target(W/m2)"]]
transformed_training_features = preprocess_features(training_features)

validation_features = validation_data.drop("target(W/m2)", axis=1, inplace=False)
validation_target = validation_data.loc[:, ["target(W/m2)"]]
transformed_validation_features = preprocess_features(validation_features)

Find a few models that seem to work well

Linear Regression

# training a liner regression model
linear_regression = linear_model.LinearRegression()
linear_regression.fit(transformed_training_features, training_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# measure training error
_predictions = linear_regression.predict(transformed_training_features)
np.sqrt(metrics.mean_squared_error(training_target, _predictions))

3.5984482821003514e-13

# measure training error
_predictions = linear_regression.predict(transformed_validation_features)
np.sqrt(metrics.mean_squared_error(validation_target, _predictions))

41.65348268671971

# user requests forecast for some week
user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = linear_regression.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("Weekly forecast")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")

Text(0.5, 0, 'Hour')

png

Linear regression is overfitting!

MultiTask ElasticNet Regression

# training a multi-task elastic net model
_prng = np.random.RandomState(42)
elastic_net = linear_model.MultiTaskElasticNet(random_state=_prng)
elastic_net.fit(transformed_training_features, training_target)

/Users/pughdr/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:1803: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 3211704.2181589045, tolerance: 24372.85566059234
  check_random_state(self.random_state), random)





MultiTaskElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
                    max_iter=1000, normalize=False,
                    random_state=<mtrand.RandomState object at 0x1a2be47f30>,
                    selection='cyclic', tol=0.0001, warm_start=False)

# measure training error
_predictions = elastic_net.predict(transformed_training_features)
np.sqrt(metrics.mean_squared_error(training_target, _predictions))

12.62446635442459

# measure validation error
_predictions = elastic_net.predict(transformed_validation_features)
np.sqrt(metrics.mean_squared_error(validation_target, _predictions))

21.755388481701576

user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = elastic_net.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("Weekly forecast")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")

Text(0.5, 0, 'Hour')

png

MultiTask ElasticNet is still underfitting with default values but does significantly better than plain linear regression.

MultiTask Lasso Regression

# training a multi-task lasso model
_prng = np.random.RandomState(42)
lasso_regression = linear_model.MultiTaskLasso(random_state=_prng)
lasso_regression.fit(transformed_training_features, training_target)

/Users/pughdr/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:1803: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 344470.44411674794, tolerance: 24372.85566059234
  check_random_state(self.random_state), random)





MultiTaskLasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
               normalize=False,
               random_state=<mtrand.RandomState object at 0x1a2cb81c18>,
               selection='cyclic', tol=0.0001, warm_start=False)

# measure training error
_predictions = lasso_regression.predict(transformed_training_features)
np.sqrt(metrics.mean_squared_error(training_target, _predictions))

9.142156744519834

# measure validation error
_predictions = lasso_regression.predict(transformed_validation_features)
np.sqrt(metrics.mean_squared_error(validation_target, _predictions))

25.11003195102284

user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = lasso_regression.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("1 January 2016")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")

Text(0.5, 0, 'Hour')

png

Lasso Regression is over-fitting.

Random Forest Regression

_prng = np.random.RandomState(42)
random_forest_regressor = ensemble.RandomForestRegressor(n_estimators=100, random_state=_prng, n_jobs=2)
random_forest_regressor.fit(transformed_training_features, training_target)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                      oob_score=False,
                      random_state=<mtrand.RandomState object at 0x1a2cc2bbd0>,
                      verbose=0, warm_start=False)

# measure training error
_predictions = random_forest_regressor.predict(transformed_training_features)
np.sqrt(metrics.mean_squared_error(training_target, _predictions))

7.011236706071975

# measure validation error
_predictions = random_forest_regressor.predict(transformed_validation_features)
np.sqrt(metrics.mean_squared_error(validation_target, _predictions))

19.32772745697949

user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = random_forest_regressor.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("Weekly Forecast")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")

Text(0.5, 0, 'Hour')

png

Random Forest with default parameters is over-fitting and needs to be regularized.

Tuning hyper-parameters

from scipy import stats

MultiTask ElasticNet Regression

_prng = np.random.RandomState(42)

_param_distributions = {
    "l1_ratio": stats.uniform(),
    "alpha": stats.lognorm(s=1),
}

elastic_net_randomized_search = model_selection.RandomizedSearchCV(
    elastic_net,
    param_distributions=_param_distributions,
    scoring="neg_mean_squared_error",
    random_state=_prng,
    n_iter=10,
    cv=8,
    n_jobs=2,
    verbose=10
)

elastic_net_randomized_search.fit(transformed_training_features, training_target)

Fitting 8 folds for each of 10 candidates, totalling 80 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  2.8min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  5.5min



---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)

<ipython-input-94-47f8877f33d1> in <module>
     17 )
     18 
---> 19 elastic_net_randomized_search.fit(transformed_training_features, training_target)


~/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    685                 return results
    686 
--> 687             self._run_search(evaluate_candidates)
    688 
    689         # For multi-metric evaluation, store the best_index_, best_params_ and


~/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1466         evaluate_candidates(ParameterSampler(
   1467             self.param_distributions, self.n_iter,
-> 1468             random_state=self.random_state))


~/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
    664                                for parameters, (train, test)
    665                                in product(candidate_params,
--> 666                                           cv.split(X, y, groups)))
    667 
    668                 if len(out) < 1:


~/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
    932 
    933             with self._backend.retrieval_context():
--> 934                 self.retrieve()
    935             # Make sure that we get a last message telling us we are done
    936             elapsed_time = time.time() - self._start_time


~/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/joblib/parallel.py in retrieve(self)
    831             try:
    832                 if getattr(self._backend, 'supports_timeout', False):
--> 833                     self._output.extend(job.get(timeout=self.timeout))
    834                 else:
    835                     self._output.extend(job.get())


~/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
    519         AsyncResults.get from multiprocessing."""
    520         try:
--> 521             return future.result(timeout=timeout)
    522         except LokyTimeoutError:
    523             raise TimeoutError()


~/Research/junctionx-kaust-2019/env/lib/python3.6/concurrent/futures/_base.py in result(self, timeout)
    425                 return self.__get_result()
    426 
--> 427             self._condition.wait(timeout)
    428 
    429             if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:


~/Research/junctionx-kaust-2019/env/lib/python3.6/threading.py in wait(self, timeout)
    293         try:    # restore state no matter what (e.g., KeyboardInterrupt)
    294             if timeout is None:
--> 295                 waiter.acquire()
    296                 gotit = True
    297             else:


KeyboardInterrupt:

_ = joblib.dump(elastic_net_randomized_search.best_estimator_,
                "../models/weekly/tuned-elasticnet-regression-model.pkl")

elastic_net_randomized_search.best_estimator_

MultiTaskElasticNet(alpha=2.154232968599504, copy_X=True, fit_intercept=True,
                    l1_ratio=0.9699098521619943, max_iter=1000, normalize=False,
                    random_state=<mtrand.RandomState object at 0x1a48e31558>,
                    selection='cyclic', tol=0.0001, warm_start=False)

(-elastic_net_randomized_search.best_score_)**0.5

18.355092813714375

user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = elastic_net_randomized_search.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("Typical weekyl forecast")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")

Text(0.5, 0, 'Hour')

png

MultiTask Lasso Regression

_prng = np.random.RandomState(42)

_param_distributions = {
    "alpha": stats.lognorm(s=1),
}

lasso_regression_randomized_search = model_selection.RandomizedSearchCV(
    lasso_regression,
    param_distributions=_param_distributions,
    scoring="neg_mean_squared_error",
    random_state=_prng,
    n_iter=10,
    cv=8,
    n_jobs=2,
    verbose=10
)

lasso_regression_randomized_search.fit(transformed_training_features, training_target)

Fitting 8 folds for each of 10 candidates, totalling 80 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    7.0s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    9.4s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:   16.5s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:   22.5s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:   30.3s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   38.4s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   48.6s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.0min
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done  80 out of  80 | elapsed:  1.8min finished





RandomizedSearchCV(cv=8, error_score='raise-deprecating',
                   estimator=MultiTaskLasso(alpha=1.0, copy_X=True,
                                            fit_intercept=True, max_iter=1000,
                                            normalize=False,
                                            random_state=<mtrand.RandomState object at 0x1a649d5870>,
                                            selection='cyclic', tol=0.0001,
                                            warm_start=False),
                   iid='warn', n_iter=10, n_jobs=2,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a7af2b470>},
                   pre_dispatch='2*n_jobs',
                   random_state=<mtrand.RandomState object at 0x1a69473438>,
                   refit=True, return_train_score=False,
                   scoring='neg_mean_squared_error', verbose=10)

_ = joblib.dump(lasso_regression_randomized_search.best_estimator_,
                "../models/weekly/tuned-lasso-regression-model.pkl")

(-lasso_regression_randomized_search.best_score_)**0.5

17.956920842745834

user_forecast_request = transformed_validation_features[-7:, :]
user_forecast_response = lasso_regression_randomized_search.predict(user_forecast_request)
actual_values_response = validation_target.values[-7:, :]

# this would be rendered in Tableau!
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
ax.plot(user_forecast_response.flatten(), label="predicted")
ax.plot(actual_values_response.flatten(), label="actual")
ax.legend()
ax.set_title("Last week of December 2015")
ax.set_ylabel("Solar Power (W/m2)")
ax.set_xlabel("Hours")

Text(0.5, 0, 'Hours')

png

Random Forest Regressor

_prng = np.random.RandomState(42)

_param_distributions = {
    "n_estimators": stats.geom(p=0.01),
     "min_samples_split": stats.beta(a=1, b=99),
     "min_samples_leaf": stats.beta(a=1, b=999),
}

_cv = model_selection.TimeSeriesSplit(max_train_size=None, n_splits=5)

random_forest_randomized_search = model_selection.RandomizedSearchCV(
    random_forest_regressor,
    param_distributions=_param_distributions,
    scoring="neg_mean_squared_error",
    random_state=_prng,
    n_iter=10,
    cv=3,
    n_jobs=2,
    verbose=10
)

random_forest_randomized_search.fit(transformed_training_features, training_target)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  2.3min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 14.0min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed: 14.8min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed: 15.4min
[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed: 18.4min finished





RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100, n_jobs=2,
                                                   oob_score=False,
                                                   random_state=<mt...
                   param_distributions={'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a28d373c8>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a28d375f8>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a28d28630>},
                   pre_dispatch='2*n_jobs',
                   random_state=<mtrand.RandomState object at 0x1a2886e828>,
                   refit=True, return_train_score=False,
                   scoring='neg_mean_squared_error', verbose=10)

_ = joblib.dump(random_forest_randomized_search.best_estimator_,
                "../models/weekly/tuned-random-forest-regression-model.pkl")

random_forest_randomized_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=0.001249828663231378,
                      min_samples_split=0.0019415164208264953,
                      min_weight_fraction_leaf=0.0, n_estimators=75, n_jobs=2,
                      oob_score=False,
                      random_state=<mtrand.RandomState object at 0x1a292509d8>,
                      verbose=0, warm_start=False)

(-random_forest_randomized_search.best_score_)**0.5

19.002834611158974

# user requests forecast for 1 January 2016 which we predict using data from 31 December 2015!
user_forecast_request = transformed_validation_features[[-1], :]
user_forecast_response = random_forest_randomized_search.predict(user_forecast_request)[0]
actual_values_response = validation_target.values[[-1], :][0]

# this would be rendered in Tableau!
plt.plot(user_forecast_response, label="predicted")
plt.plot(actual_values_response, label="actual")
plt.legend()
plt.title("Typical weekly forecast")
plt.ylabel("Solar Power (W/m2)")
plt.xlabel("Hour")

png

Assess model performance on testing data

testing_features = testing_data.drop("target(W/m2)", axis=1, inplace=False)
testing_target = testing_data.loc[:, ["target(W/m2)"]]
transformed_testing_features = preprocess_features(testing_features)

elastic_net_predictions = elastic_net_randomized_search.predict(transformed_testing_features)
np.sqrt(metrics.mean_squared_error(testing_target, elastic_net_predictions))

19.76022744338562

lasso_regression_predictions = lasso_regression_randomized_search.predict(transformed_testing_features)
np.sqrt(metrics.mean_squared_error(testing_target, lasso_regression_predictions))

19.73446980783998

# random forest wins!
random_forest_predictions = random_forest_randomized_search.predict(transformed_testing_features)
np.sqrt(metrics.mean_squared_error(testing_target, random_forest_predictions))

19.826312994581507

# user requests forecast for last week of 2018
user_forecast_request = transformed_testing_features[[-1], :]
user_forecast_response = random_forest_randomized_search.predict(user_forecast_request)[0]
actual_values_response = testing_target.values[[-1], :][0]

# this would be rendered in Tableau!
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
ax.plot(user_forecast_response.flatten(), label="predicted")
ax.plot(actual_values_response.flatten(), label="actual")
ax.legend()
ax.set_title("Typical weekly forecast")
ax.set_ylabel("Solar Power (W/m2)")
ax.set_xlabel("Hours")
plt.savefig("../results/img/typical-weekly-actual-vs-predicted-solar-power.png")

png

# combine the training and validtion data
combined_training_features = pd.concat([training_features, validation_features])
transformed_combined_training_features = preprocess_features(combined_training_features)
combined_training_target = pd.concat([training_target, validation_target])

# tune a random forest regressor using CV ro avoid overfitting
_prng = np.random.RandomState(42)

_param_distributions = {
    "n_estimators": stats.geom(p=0.01),
     "min_samples_split": stats.beta(a=1, b=99),
     "min_samples_leaf": stats.beta(a=1, b=999),
}

tuned_random_forest_regressor = model_selection.RandomizedSearchCV(
    ensemble.RandomForestRegressor(n_estimators=100, random_state=_prng),
    param_distributions=_param_distributions,
    scoring="neg_mean_squared_error",
    random_state=_prng,
    n_iter=10,
    cv=5,
    n_jobs=2,
    verbose=10
)

tuned_random_forest_regressor.fit(combined_training_features, combined_training_target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  2.5min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  4.9min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 10.5min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed: 14.2min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed: 16.4min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed: 17.9min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 20.3min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 26.2min
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed: 28.3min finished
/Users/pughdr/Research/junctionx-kaust-2019/env/lib/python3.6/site-packages/sklearn/model_selection/_search.py:813: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)





RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                                                   n_jobs=None, oob_score=False,
                                                   random_state=...
                   param_distributions={'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a32bef860>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a32bef198>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a32bef7f0>},
                   pre_dispatch='2*n_jobs',
                   random_state=<mtrand.RandomState object at 0x1a2cc608b8>,
                   refit=True, return_train_score=False,
                   scoring='neg_mean_squared_error', verbose=10)

tuned_random_forest_regressor.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=0.0004129137216396695,
                      min_samples_split=0.004789233605970227,
                      min_weight_fraction_leaf=0.0, n_estimators=78,
                      n_jobs=None, oob_score=False,
                      random_state=<mtrand.RandomState object at 0x1a32c13948>,
                      verbose=0, warm_start=False)

(-tuned_random_forest_regressor.best_score_)**0.5

18.785573993005393

# user requests forecast for last week of 2018
user_forecast_request = transformed_testing_features[[-1], :]
user_forecast_response = random_forest_randomized_search.predict(user_forecast_request)[0]
actual_values_response = testing_target.values[[-1], :][0]

# this would be rendered in Tableau!
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
ax.plot(user_forecast_response.flatten(), label="predicted")
ax.plot(actual_values_response.flatten(), label="actual")
ax.legend()
ax.set_title("Typical weekly forecast")
ax.set_ylabel("Solar Power (W/m2)")
ax.set_xlabel("Hours")
plt.savefig("../results/img/typical-weekly-actual-vs-predicted-solar-power.png")

png

Forecasting the future of solar power at NEOM

Once the model is trained, the model can generate a new forecast for next week’s solar power generation. Once actual values of solar power generation are observed, model can be automatically re-trained and improved. Model can be retrained with weekly, monthly forecast horizons if longer forecasts are required.

incoming_features = features.loc[[687]]
new_predictions = tuned_random_forest_regressor.predict(incoming_features)[0]
solar_power_forecast = (pd.DataFrame.from_dict({"Timestamp": pd.date_range(start="2019-01-01", end="2019-01-07 23:00:00", freq='H'),
                                                "Predicted Solar Power (W/m2)": new_predictions})
                          .set_index("Timestamp", inplace=False))

_ = solar_power_forecast.plot()

png

weekly predicting solar power at neom

Ayachips

This is the jupyter notebook from our JunctionX kaust hackaton (weekly firecast), source is here

Predicting Solar Power Output at NEOM

Feature Engineering

Train, Test Split

Preprocessing the training and validation data

Find a few models that seem to work well

Linear Regression

MultiTask ElasticNet Regression

MultiTask Lasso Regression

Random Forest Regression

Tuning hyper-parameters

MultiTask ElasticNet Regression

MultiTask Lasso Regression

Random Forest Regressor

Assess model performance on testing data

Forecasting the future of solar power at NEOM