from spacy.lang.en import English
from spacy.pipeline import Sentencizer
from keras.preprocessing.text import Tokenizer
from keras.layers import Input, Dense
from keras.models import Model
import numpy as np
import re

# Two-decimal array formatting keeps the printed prediction vectors compact.
np.set_printoptions(precision=2)

# `nlp` is a blank English pipeline; the standalone Sentencizer component is
# applied to the Docs it produces to mark sentence boundaries.
nlp = English()
sentencizer = Sentencizer()

# Bracketed numeric citation markers such as "[3]" or "[12]".
_CITATION_RE = re.compile(r'\[\d+\]')


def _load_sentences(path):
    """Read a UTF-8 text file and return its sentences as stripped strings.

    Citation markers of any digit count are removed first, then the text is
    tokenized with ``nlp`` and split into sentences with ``sentencizer``.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of non-empty sentence strings in document order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, encoding="utf-8") as f:
        text = _CITATION_RE.sub('', f.read())
    doc = sentencizer(nlp(text))
    stripped = (span.text.strip() for span in doc.sents)
    # Spans that are empty after stripping contain no words; keeping them
    # would add all-zero feature rows carrying a class label.
    return [sentence for sentence in stripped if sentence]


# One sentence list per class; the label matrix is sized from these lengths.
anime_sents = _load_sentences("anime.txt")
basketball_sents = _load_sentences("basketball.txt")
everything_sents = _load_sentences("everything.txt")

# One-hot targets: row i of the 3x3 identity matrix is the label for class i
# (anime, basketball, everything), repeated once per sentence of that class.
sentence_counts = [
    len(anime_sents),
    len(basketball_sents),
    len(everything_sents),
]
labels = np.repeat(np.eye(3, dtype=int), sentence_counts, axis=0)

# All training sentences, concatenated class by class
# (anime, basketball, everything).
corpus = anime_sents + basketball_sents + everything_sents

# Fit the vocabulary on the corpus, then encode every sentence as one
# TF-IDF weighted row of the document-term matrix.
t = Tokenizer()
t.fit_on_texts(corpus)
dtm = t.texts_to_matrix(corpus, mode='tfidf')

# Feed-forward classifier: one input per document-term-matrix column,
# a 20-unit ReLU hidden layer, and a 3-way softmax output.
n_features = dtm.shape[1]
input_layer = Input(shape=(n_features,))
hidden_layer = Dense(20, activation='relu')(input_layer)
output_layer = Dense(3, activation='softmax')(hidden_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train on every row of the matrix for 20 epochs with progress output off.
model.fit(dtm, labels, epochs=20, verbose=0)


# Interactive loop: classify each sentence the user types until they enter
# 'done' or the input stream is closed.
while True:
    try:
        sent = input("Enter a sentence: ")
    except EOFError:
        # stdin was closed (Ctrl-D or exhausted piped input): stop quietly
        # instead of ending the session with a traceback.
        break
    if sent == 'done':
        break
    # Encode with the already-fitted tokenizer so the feature columns match
    # the ones the model was trained on, then print the model's output.
    coded = t.texts_to_matrix([sent], mode='tfidf')
    print(model.predict(coded))