##################################################################################
##################################################################################
#-------------------
#1. Title of Dataset:  Meteorological Database
#
#2. Authorship: 
#	Research Group Solar and Wind Feasibility Technologies (SWIFT), Electromechanical Engineering Department, Universidad de Burgos, 09006 Burgos, Spain.
#
##################################################################################
##################################################################################
#FS Boruta
#DataSet: df = pd.read_csv('csv file directory')
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
# Feature matrix: the MI dataset. `Predata` is defined outside this section.
X_train = Predata
# Target vector: the CIE labels held in the `cloud` column of `df`.
y_train = df["cloud"]
# Base estimator handed to Boruta: a shallow, heavily regularised random
# forest (depth <= 5, at least 50 samples per split and per leaf).
rf_boruta = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=1,
    random_state=0,
    max_depth=5,
    min_samples_split=50,
    min_samples_leaf=50,
    min_impurity_decrease=0.01,
)
# Boruta wrapper around the forest. Note that BorutaPy receives its own
# n_estimators=100 in addition to the forest's n_estimators=1.
boruta = BorutaPy(
    rf_boruta,
    random_state=0,
    n_estimators=100,
    verbose=2,
)
# BorutaPy is fed plain arrays here, with the labels flattened to 1-D.
boruta.fit(X_train.values, y_train.values.ravel())
### print results
# Bare expression: shown as cell output in a notebook, no effect in a script.
boruta.support_
##################################################################################
##################################################################################
#FS Permutation Importance
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.svm import SVC
# Feature matrix: the MI dataset. `Predata` is defined outside this section.
X_train = Predata
# Target vector: the CIE labels held in the `cloud` column of `df`.
y_train = df.cloud
#FS algorithm
# Fit on the X_train / y_train bound just above so that this section does not
# depend on `X` / `y`, which are only assigned further down the file.
treetest = RandomForestClassifier(random_state=6, n_estimators=1).fit(X_train, y_train)
# Permutation importance of the fitted forest, scored with weighted F1.
perm = PermutationImportance(estimator=treetest, random_state=150, scoring='f1_weighted')
perm.fit(X_train, y_train)
### print results
# Label each weight with the corresponding column name of the feature matrix.
result = eli5.show_weights(perm, feature_names=X_train.columns.tolist())
##################################################################################
##################################################################################
#FS Pearson Correlation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import graphviz

#DataSet: df = pd.read_csv('csv file directory')
# Absolute value of the pairwise column correlations of `df`
# (DataFrame.corr with its default settings).
cor = df.corr().abs()
### print results
# NOTE(review): `sns` is not imported anywhere in the visible file; presumably
# it is seaborn (`import seaborn as sns`) -- confirm and add the import.
plt.figure(figsize=(25, 25))
sns.heatmap(
    cor,
    annot=True,
    cmap=plt.cm.Blues,
)
plt.title("Pearson Coeficient Matrix", fontsize='large')
plt.show()
##################################################################################
##################################################################################
#FS RFE
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Imported here because the file only imports these names in a later section;
# without them this section fails with NameError when run top to bottom.
from sklearn import tree
from sklearn.model_selection import train_test_split

# Feature matrix: the MI dataset. `Predata` is defined outside this section.
X = Predata
# Target vector: the CIE labels held in the `cloud` column of `df`.
y = df.cloud
# Candidate numbers of features to keep: 1..42. The maximum is meant to be
# the number of MI considered.
# NOTE(review): the upper bound is hard-coded -- confirm it matches X.shape[1].
nof_list = np.arange(1, 43)
# Weighted-F1 score obtained for each candidate, in the order of nof_list.
score_list = []
# The split uses a fixed random_state, so it is identical for every candidate
# and only needs to be computed once.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
for n_features in nof_list:
    model = RandomForestClassifier(random_state=10)
    # n_features_to_select is keyword-only in current scikit-learn releases.
    rfe = RFE(model, n_features_to_select=n_features, step=0.01)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    # Refit the forest on the selected columns and score it on the matching
    # reduced test set.
    model.fit(X_train_rfe, y_train)
    y_pred = model.predict(X_test_rfe)
    score = f1_score(y_test, y_pred, average='weighted')
    score_list.append(score)
# Best score and the (first) feature count that reached it. Deriving both from
# score_list keeps `nof` defined even when every score is 0.
high_score = max(score_list)
nof = nof_list[score_list.index(high_score)]
# Once the optimum number of features has been chosen, RFE is run on the full
# dataset with that number.
# NOTE(review): the search above uses a random forest while the final ranking
# uses a single decision tree -- confirm this is intended.
treetest = tree.DecisionTreeClassifier(random_state=1)
rfe = RFE(estimator=treetest, step=0.05, n_features_to_select=nof)
rfe.fit(X, y)
### print results
# Bare expression: shown as cell output in a notebook, no effect in a script.
rfe.ranking_
##################################################################################
##################################################################################
##################################################################################
#CLASSIFICATION TREES
##################################################################################
##################################################################################
#Feature Selection helps to prevent redundant and useless information. 
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import graphviz 

# GridSearchCV is used below but never imported in the file; import it here so
# the section runs without a NameError.
from sklearn.model_selection import GridSearchCV

#1-ENTERING DATASET
# `variables` is the list of MI selected by the FS process (defined elsewhere).
feature_names = variables
# Extract the columns chosen by FS from the dataset (df) as a plain array.
features = df[feature_names].values
# CIE labels
labels = df['cloud'].values
#2-SPLITTING DATASET
# 80 % training / 20 % test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=0,
)
#3-PARAMETERS
# Exhaustive grid: 2 criteria x 17 split sizes x 17 leaf sizes x 4 depths
# x 99 seeds = 228,888 candidates, each fitted once per CV fold.
# NOTE(review): 'random_state' is searched like a hyper-parameter, which
# multiplies the grid by 99 and selects a seed -- confirm this is intended.
parameters = {
    'criterion': ['entropy', 'gini'],
    'min_samples_split': range(150, 1000, 50),
    'min_samples_leaf': range(150, 1000, 50),
    'max_depth': range(1, 5, 1),
    'random_state': range(1, 100, 1),
}
#4-TRAINING
clf_tree = tree.DecisionTreeClassifier()
# 4-fold cross-validated grid search over the parameters above.
clf_grid = GridSearchCV(clf_tree, parameters, cv=4)
clf_grid.fit(X_train, y_train)
clf_best = clf_grid.best_estimator_
# Bare expression: shown as cell output in a notebook, no effect in a script.
clf_best
#5-TREE
# Render the selected tree as Graphviz DOT source (out_file=None returns it as
# a string instead of writing a file).
# NOTE(review): class_names assumes exactly three classes whose sorted label
# order corresponds to clear / partial / cloudy -- confirm against the data.
final_data = tree.export_graphviz(
    clf_best,
    out_file=None,
    feature_names=feature_names,
    class_names=['clear', 'partial', 'cloudy'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(final_data)
# Bare expression: shown as cell output in a notebook, no effect in a script.
graph
##################################################################################
##################################################################################
##################################################################################
##################################################################################
##################################################################################