"""Train a small anime-vs-basketball sentence classifier and query it interactively.

The script reads ``anime.txt`` and ``basketball.txt``, splits each into
sentences with spaCy's ``Sentencizer``, labels anime sentences 0 and
basketball sentences 1, and fits a Keras ``Tokenizer`` on all of them.
An embedding matrix is filled from ``glove.6B.50d.txt`` for every vocabulary
word found there, then a frozen-embedding dense network is trained for
20 epochs.  Finally the user is prompted for sentences until ``done`` is
entered, and the model output for each sentence is printed.
"""
import re

from spacy.lang.en import English
from spacy.pipeline import Sentencizer
from keras.preprocessing.text import Tokenizer
from keras.layers import Input, Dense, Embedding, Flatten
from keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Number of token ids kept per sentence (shorter ones are post-padded).
MAX_LEN = 10
# Width of each vector in glove.6B.50d.txt.
EMBED_DIM = 50
# Bracketed numeric markers such as "[3]" or "[12]".  The original pattern
# used a single \d and therefore left multi-digit markers in the text.
CITATION_RE = re.compile(r'\[\d+\]')

nlp = English()
sentencizer = Sentencizer()
np.set_printoptions(precision=2)


def load_sentences(path):
    """Return the whitespace-stripped sentences of the UTF-8 text file at *path*.

    Bracketed numeric markers are removed before the text is tokenized by
    ``nlp`` and split into sentences by ``sentencizer``.
    """
    with open(path, encoding="utf-8") as f:
        text = CITATION_RE.sub('', f.read())
    doc = sentencizer(nlp(text))
    return [span.text.strip() for span in doc.sents]


anime_sents = load_sentences("anime.txt")
basketball_sents = load_sentences("basketball.txt")
all_sents = anime_sents + basketball_sents

# Label 0 = anime sentence, label 1 = basketball sentence, in the same order
# as all_sents.
labels = np.concatenate([
    np.repeat(0, len(anime_sents)),
    np.repeat(1, len(basketball_sents)),
])

t = Tokenizer()
t.fit_on_texts(all_sents)
# +1 because Tokenizer word indices start at 1; row 0 is left as zeros.
vocab_size = len(t.word_index) + 1

# Rows for words that never appear in the GloVe file stay all-zero.
E = np.zeros(shape=(vocab_size, EMBED_DIM))
with open("glove.6B.50d.txt", encoding='utf-8') as f:
    for line in f:
        stuff = line.split()
        if not stuff:
            # Blank line: nothing to parse.
            continue
        word = stuff[0]
        row = t.word_index.get(word)
        if row is None:
            # Not in our vocabulary; skip without parsing the floats.
            continue
        print(f"Yes! adding embedding for word {word} (at row {row})")
        E[row, :] = np.array([float(x) for x in stuff[1:]])

encoded = t.texts_to_sequences(all_sents)
padded = pad_sequences(encoded, maxlen=MAX_LEN, padding="post")

input_layer = Input(shape=(padded.shape[1],))
e_layer = Embedding(
    vocab_size,
    EMBED_DIM,
    input_length=MAX_LEN,
    weights=[E],
    trainable=False,  # keep the pretrained vectors fixed during training
)(input_layer)
f_layer = Flatten()(e_layer)
hidden_layer = Dense(20, activation='relu')(f_layer)
output_layer = Dense(1, activation='sigmoid')(hidden_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy', metrics=['accuracy'])
model.fit(padded, labels, epochs=20, verbose=0)

# Prompt repeatedly; iter() stops as soon as the user types exactly 'done'.
for sent in iter(lambda: input("Enter a sentence: "), 'done'):
    encoded_sent = t.texts_to_sequences([sent])
    padded_sent = pad_sequences(encoded_sent, maxlen=MAX_LEN, padding="post")
    # Sigmoid output: values near 0 correspond to label 0 (anime), near 1 to
    # label 1 (basketball).
    print(model(padded_sent))