import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split

# 1. Load our data set.
data = pd.read_csv("election.csv")

# 2. Use the sklearn encoders (ordinal and one-hot) to prepare our data set.
oe = OrdinalEncoder()
ohe = OneHotEncoder(sparse_output=False)
X_oe = oe.fit_transform(data[['home','votedLast']])
X_ohe = ohe.fit_transform(data[['registered']])
X = np.concatenate([X_oe, X_ohe], axis=1)
X = pd.DataFrame(X, columns=np.concatenate([oe.get_feature_names_out(),
    ohe.get_feature_names_out()]))
y = data['candidate']   # a Series, so .fit() sees a 1-D target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# 3. Create and fit a decision tree to the training data.
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

# 4. Let's look at the tree it came up with... (export_text() wants a plain
# list of names, not a pandas Index, hence the list() call.)
looksie = export_text(dtc, feature_names=list(X.columns))
print(looksie)

# 5. Run our decision tree on the test data, giving us an array of predictions,
# one for each data point.
y_pred = dtc.predict(X_test)

# 6. Form an array of all the true (gold) values for these data points.
y_test = np.array(y_test)

# 7. Compute test set accuracy, in two different ways (the Caroline way, and
# the Stephen way).
count = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_test[i]:
        count += 1
percent_correct_caroline = count / len(y_pred) * 100
print("The DT got {}% on the test set.".format(percent_correct_caroline))

percent_correct_stephen = sum(y_test == y_pred) / len(y_pred) * 100
print("The DT got {}% on the test set.".format(percent_correct_stephen))


## 5. cross_val_score()
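# A minimal sketch of cross_val_score(), assuming the same X and y built
# above. cross_val_score() repeats the split-train-evaluate cycle for us:
# cv=5 asks for 5 folds, so every data point appears in a test set exactly
# once. The cv value (and fitting a fresh DecisionTreeClassifier) are
# illustrative choices here, not requirements.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(DecisionTreeClassifier(), X, y, cv=5)
print("Accuracy on each fold: {}".format(scores))
print("Mean CV accuracy: {}%".format(scores.mean() * 100))

# Averaging over folds gives a more stable estimate of accuracy than the
# single 75/25 split above, since no one lucky (or unlucky) split dominates.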