import itertools
import random

# Number of feature values per sample.  A raw sample in the 'dc' series is
# this many values followed by one further value (337 values in total).
_FEATURES_PER_SAMPLE = 336


def split(data, k):
    """Split the 'dc' series of ``data`` into ``k`` cross-validation folds.

    The series is cut into consecutive, non-overlapping samples of 337
    values; an incomplete trailing sample is discarded.  The samples are
    shuffled with ``random.shuffle`` and dealt round-robin into ``k`` folds
    whose sizes differ by at most one sample.

    NOTE(review): this assumes the first 336 values of a sample are the
    features and the 337th value is the label for them -- confirm against
    the data produced by ``pywatts.db``.

    Args:
        data: Mapping-like object (e.g. a pandas DataFrame) whose ``'dc'``
            entry is either an object offering ``tolist()`` or any iterable
            of values.
        k: Number of folds, at least 2.

    Returns:
        ``(X_train, y_train, X_eval, y_eval)``, four lists of length ``k``.
        Every entry is a dict ``{'dc': [...]}``.  ``X_eval[i]`` / ``y_eval[i]``
        hold the flat feature values / labels of fold ``i``; ``X_train[i]`` /
        ``y_train[i]`` hold those of all other folds.  Feature lists contain
        336 values per sample, label lists one value per sample, both in the
        same sample order.

    Raises:
        ValueError: If ``k`` is smaller than 2 or there are fewer complete
            samples than folds.
    """
    if k < 2:
        raise ValueError("k must be at least 2, got %s" % (k,))

    series = data['dc']
    values = series.tolist() if hasattr(series, 'tolist') else list(series)

    sample_len = _FEATURES_PER_SAMPLE + 1
    # "+ 1" keeps the final sample when len(values) is an exact multiple of
    # sample_len; without it the last complete sample would be dropped.
    samples = [values[start:start + sample_len]
               for start in range(0, len(values) - sample_len + 1, sample_len)]
    if len(samples) < k:
        raise ValueError("cannot build %s folds from %s complete samples"
                         % (k, len(samples)))

    random.shuffle(samples)
    folds = [samples[offset::k] for offset in range(k)]

    X_train, y_train, X_eval, y_eval = [], [], [], []
    for held_out, eval_fold in enumerate(folds):
        train_samples = [sample
                         for idx, fold in enumerate(folds) if idx != held_out
                         for sample in fold]
        X_eval.append({'dc': _flat_features(eval_fold)})
        y_eval.append({'dc': _labels(eval_fold)})
        X_train.append({'dc': _flat_features(train_samples)})
        y_train.append({'dc': _labels(train_samples)})

    return X_train, y_train, X_eval, y_eval


def _flat_features(samples):
    """Return the feature values of all samples as one flat list."""
    return list(itertools.chain.from_iterable(
        sample[:_FEATURES_PER_SAMPLE] for sample in samples))


def _labels(samples):
    """Return the trailing (label) value of every sample."""
    return [sample[_FEATURES_PER_SAMPLE] for sample in samples]


def train(nn, X_train, y_train, X_eval, y_eval, steps=10):
    """Train ``nn`` on every fold and evaluate it after each training step.

    For each fold ``nn.train`` is called ``steps`` times with ``steps=1``;
    after every call ``nn.evaluate`` is run on the fold's evaluation data.
    The batch size passed to both is the number of samples in the respective
    data set (number of feature values divided by 336).

    Args:
        nn: Object offering ``train(X, y, batch_size=..., steps=...)`` and
            ``evaluate(X, y, batch_size=...)``.
        X_train: List of ``{'dc': [...]}`` training features, one per fold.
        y_train: List of ``{'dc': [...]}`` training labels, one per fold.
        X_eval: List of ``{'dc': [...]}`` evaluation features, one per fold.
        y_eval: List of ``{'dc': [...]}`` evaluation labels, one per fold.
        steps: Number of train/evaluate rounds per fold.

    Returns:
        List with the result of every ``nn.evaluate`` call, ordered by fold
        and then by step (``len(X_train) * steps`` entries).
    """
    evaluation = []
    for fold, train_features in enumerate(X_train):
        eval_features = X_eval[fold]
        train_batch = len(train_features['dc']) // _FEATURES_PER_SAMPLE
        eval_batch = len(eval_features['dc']) // _FEATURES_PER_SAMPLE
        for step in range(steps):
            nn.train(train_features, y_train[fold],
                     batch_size=train_batch, steps=1)
            evaluation.append(nn.evaluate(eval_features, y_eval[fold],
                                          batch_size=eval_batch))
            print("Training %s: %s/%s" % (fold, (step + 1), steps))

    # The caller plots these results, so they must be handed back.
    return evaluation
# Script: load station data, train the network via pywatts.kcross and
# optionally plot what kcross.train returned.
import peewee
import tensorflow as tf
import pywatts.db
from pywatts import kcross

NUM_STATIONS_FROM_DB = 75
K = 4
NUM_EVAL_STATIONS = 40
TRAIN = True
PLOT = True
TRAIN_STEPS = 4


# Station ids 1 .. NUM_STATIONS_FROM_DB - 1 are loaded into one DataFrame.
station_ids = list(range(1, NUM_STATIONS_FROM_DB))
df = pywatts.db.rows_to_df(station_ids)
X = df
y = df['dc']


# One numeric feature column per index "0" .. "335", then the regressor wrapper.
feature_columns = [tf.feature_column.numeric_column(name)
                   for name in map(str, range(336))]
net = pywatts.neural.Net(feature_cols=feature_columns)


# Per-fold training and evaluation data.
X_train, y_train, X_eval, y_eval = kcross.split(df, K)


# Stays an empty dict when TRAIN is switched off.
train_eval = {}

if TRAIN:
    # Train the model with the configured number of steps.
    train_eval = kcross.train(net, X_train, y_train, X_eval, y_eval,
                              TRAIN_STEPS)

if PLOT:
    # Plot training success rate (with 'average loss').
    pywatts.main.plot_training(train_eval)


exit()