"""Compare bag-of-words encodings for a two-topic sentence classifier.

Sentences from ``anime.txt`` (label 0) and ``basketball.txt`` (label 1) are
encoded with each Keras ``Tokenizer.texts_to_matrix`` mode, a small dense
network is trained on every encoding over several random splits, and the
resulting test accuracies are saved as a box plot in ``hist.png``.
"""
import re

import matplotlib.pyplot as plt
import numpy as np
from keras.layers import Dense, Input
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from spacy.lang.en import English
from spacy.pipeline import Sentencizer

nlp = English()
sentencizer = Sentencizer()
np.set_printoptions(precision=2)

# Bracketed numeric citation markers such as "[3]" or "[12]".  The original
# pattern matched a single digit only and left multi-digit markers in place.
_CITATION_RE = re.compile(r'\[\d+\]')


def _load_sentences(path):
    """Read *path*, strip citation markers, and return its sentences as a list of str."""
    with open(path, encoding="utf-8") as f:
        text = _CITATION_RE.sub('', f.read())
    doc = sentencizer(nlp(text))
    return [span.text.strip() for span in doc.sents]


anime_sents = _load_sentences("anime.txt")
basketball_sents = _load_sentences("basketball.txt")

# One combined corpus; the label order must match the concatenation order.
sents = anime_sents + basketball_sents
labels = np.concatenate([
    np.repeat(0, len(anime_sents)),
    np.repeat(1, len(basketball_sents)),
])

# Fit the vocabulary once on the full corpus (previously this referenced an
# undefined name ``sents`` and raised NameError).
t = Tokenizer()
t.fit_on_texts(sents)


def evaluate_model(sents, labels, mode, num_epochs, num_trials):
    """Train and score a small dense classifier over repeated random splits.

    Args:
        sents: Sequence of sentence strings to classify.
        labels: Array of 0/1 class labels aligned with ``sents``.
        mode: Encoding passed to ``Tokenizer.texts_to_matrix``
            (the script uses 'binary', 'count', 'freq' and 'tfidf').
        num_epochs: Number of training epochs per trial.
        num_trials: Number of independent train/test splits to run.

    Returns:
        A NumPy array of length ``num_trials`` holding the test accuracy of
        each trial.

    Note:
        Uses the module-level tokenizer ``t``, which must already be fitted.
    """
    dtm = t.texts_to_matrix(sents, mode=mode)
    accuracies = np.empty(num_trials)
    for trial in range(num_trials):
        print(f" {trial}...")
        # Fresh 80/20 split and a freshly initialised model for every trial.
        Xtrain, Xtest, ytrain, ytest = train_test_split(
            dtm, labels, test_size=.2)
        input_layer = Input(shape=(dtm.shape[1],))
        hidden_layer = Dense(20, activation='relu')(input_layer)
        output_layer = Dense(1, activation='sigmoid')(hidden_layer)
        model = Model(inputs=input_layer, outputs=output_layer)
        # No optimizer is passed, so Keras's default is used.
        model.compile(loss='binary_crossentropy', metrics=['accuracy'])
        model.fit(Xtrain, ytrain, epochs=num_epochs, verbose=0)
        # evaluate() returns [loss, accuracy]; keep only the accuracy.
        accuracies[trial] = model.evaluate(Xtest, ytest)[1]
    return accuracies


res = {}
modes = ['binary', 'count', 'freq', 'tfidf']
for mode in modes:
    print(f"Calculating {mode}...")
    res[mode] = evaluate_model(sents, labels, mode, 2, 10)

plt.clf()
# Materialise the dict view so boxplot receives a plain sequence of arrays,
# one box per mode in insertion order.
plt.boxplot(list(res.values()))
plt.xticks(range(1, len(modes) + 1), modes)
plt.ylim((0, 1))
plt.savefig("hist.png")