"""Pick decision-tree split columns for a table whose last column is the label.

Run as a script, this reads ``election.csv``, chooses the root split column
and then, for every value of that column, the best column to split on next.
"""
import pandas as pd


def find_best_split_col(df, *, verbose=True):
    """Score every feature column of *df* as a candidate split.

    The last column of *df* is treated as the target; all columns before it
    are features.  A feature's score is the sum, over each distinct value of
    that feature, of the count of the most frequent target label among the
    rows holding that value -- i.e. how many rows a per-value majority vote
    would get right.

    Args:
        df: DataFrame whose final column is the target label.
        verbose: When true (the default) print the per-value and per-feature
            tallies as they are computed.

    Returns:
        A ``pd.Series`` indexed by feature name holding each feature's score,
        in column order.  It is empty when *df* has no feature columns.
    """
    feats = df.columns[:-1]
    target = df.columns[-1]

    # Collect into a dict and build the Series once, instead of growing a
    # dtype-less empty Series one item at a time.
    scores = {}
    for feat in feats:
        if verbose:
            print("Considering splitting on {}...".format(feat))
        right = 0
        # NOTE: value_counts() skips NaN by default, so rows with a missing
        # feature value contribute nothing to the score.
        for val in df[feat].value_counts().keys():
            # Computed once and reused for both the total and the log line.
            majority = df.loc[df[feat] == val, target].value_counts().max()
            right += majority
            if verbose:
                print(" {}-{}:{}".format(feat, val, majority))
        if verbose:
            print("Splitting on {} gives us {} right.".format(feat, right))
        scores[feat] = right

    if not scores:
        # An explicit dtype avoids the dtype-less empty-Series constructor.
        return pd.Series(dtype="int64")
    return pd.Series(scores)


def main(path='election.csv'):
    """Print the root split and the second-level split for each root value.

    Args:
        path: CSV file to load (anything ``pd.read_csv`` accepts); its last
            column is used as the target.

    Raises:
        ValueError: If the table has no feature column to split on.
    """
    e = pd.read_csv(path)

    root_scores = find_best_split_col(e)
    if root_scores.empty:
        raise ValueError(
            "need at least one feature column before the target column"
        )
    root = root_scores.idxmax()
    print("Root: split on {}!".format(root))

    for val in e[root].value_counts().keys():
        print()
        # Rows on this branch, without the column that was already used.
        branch = e[e[root] == val].drop(root, axis=1)
        branch_scores = find_best_split_col(branch)
        if branch_scores.empty:
            # Only the target is left; idxmax() on an empty Series would raise.
            print(" {}={}: no features left to split on.".format(root, val))
            continue
        node = branch_scores.idxmax()
        print(" {}={}: split on {}!".format(root, val, node))


if __name__ == "__main__":
    main()