import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split

# 1. Load our data set.
data = pd.read_csv("election.csv")

# 2. Use the sklearn encoders (ordinal and one-hot) to prepare our data set.
oe = OrdinalEncoder()
ohe = OneHotEncoder(sparse_output=False)
X_oe = oe.fit_transform(data[['home','votedLast']])
X_ohe = ohe.fit_transform(data[['registered']])
X = np.concatenate([X_oe, X_ohe], axis=1)
X = pd.DataFrame(X, columns=np.concatenate([oe.get_feature_names_out(),
    ohe.get_feature_names_out()]))
y = data['candidate']   # a Series, so .fit() sees a 1-D target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# 3. Create and fit a decision tree to the training data.
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

# 4. Let's look at the tree it came up with... (export_text() wants a plain
# list of names, not a pandas Index, hence the list() call.)
looksie = export_text(dtc, feature_names=list(X.columns))
print(looksie)

# 5. Run our decision tree on the test data, giving us an array of predictions,
# one for each data point.
y_pred = dtc.predict(X_test)

# 6. Form an array of all the true (gold) values for these data points.
y_test = np.array(y_test)

# 7. Compute test set accuracy, in two different ways (the Caroline way, and
# the Stephen way).
count = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_test[i]:
        count += 1
percent_correct_caroline = count / len(y_pred) * 100
print("The DT got {}% on the test set.".format(percent_correct_caroline))

percent_correct_stephen = sum(y_test == y_pred) / len(y_pred) * 100
print("The DT got {}% on the test set.".format(percent_correct_stephen))


## 5. cross_val_score()
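# A minimal sketch of cross_val_score(), assuming the same X and y built
# above. cross_val_score() repeats the split-train-evaluate cycle for us:
# cv=5 asks for 5 folds, so every data point appears in a test set exactly
# once. The cv value (and fitting a fresh DecisionTreeClassifier) are
# illustrative choices here, not requirements.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(DecisionTreeClassifier(), X, y, cv=5)
print("Accuracy on each fold: {}".format(scores))
print("Mean CV accuracy: {}%".format(scores.mean() * 100))

# Averaging over folds gives a more stable estimate of accuracy than the
# single 75/25 split above, since no one lucky (or unlucky) split dominates.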