diff --git a/nlp.py b/nlp.py
index 036e929..fd126c3 100644
--- a/nlp.py
+++ b/nlp.py
@@ -1,6 +1,3 @@
-
-
-import json
 import pandas as pd
 import nltk
 import numpy as np
@@ -9,22 +6,14 @@ from nltk.stem import WordNetLemmatizer, SnowballStemmer
 from gensim.parsing.preprocessing import STOPWORDS
 from gensim.utils import simple_preprocess
 import gensim
-
-
 from sklearn.datasets import fetch_20newsgroups
-newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)
-newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True)
+
+newsgroups_train = fetch_20newsgroups(subset='train')
+newsgroups_test = fetch_20newsgroups(subset='test')
 
 np.random.seed(400)
-
-
 stemmer = SnowballStemmer("english")
-
-# context = ssl._create_unverified_context()
-
-
-def get_data():
-    pass
+NUM_TOPICS = 7
 
 
 def lemmatize_stemming(text):
@@ -41,11 +30,13 @@ def preprocess(text):
     return result
 
 
-def categorize_str(s: str) -> int:
+def categorize_str(s: str, lda_model) -> int:
     """
     Takes in a string to determine which topic it belongs to
     Returns the topic number as an int
     """
-    bow_vector = dictionary.doc2bow(preprocess(s))
+    processed_doc = preprocess(s)
+    # reuse the model's own dictionary so token ids match the trained topics
+    bow_vector = lda_model.id2word.doc2bow(processed_doc)
     ldaResults = sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1])
     return ldaResults[0][0]
@@ -54,25 +45,32 @@ def categorize_str(s: str) -> int:
 def create_model(documents: list):
     """
     Takes a list of strings to create model
-    returns the lda model and dictionary
+    returns the lda model
     """
     processed_docs = []
-    for doc in newsgroups_train.data:
+    for doc in documents:
         processed_docs.append(preprocess(doc))
     dictionary = gensim.corpora.Dictionary(processed_docs)
     bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
     lda_model = gensim.models.LdaMulticore(bow_corpus,
-                                           num_topics=7,
+                                           num_topics=NUM_TOPICS,
                                            id2word=dictionary,
                                            passes=10,
                                            workers=2)
-    return(lda_model, dictionary)
+    return lda_model
 
 
-lda_model, dictionary = create_model(newsgroups_train.data)
-for idx, topic in lda_model.show_topics(formatted=False, num_words=30):
-    print('Topic: {} \nWords: {}'.format(idx, [w[0] for w in topic]))
+def update_model(s: str, lda_model):
+    """
+    Takes in a string and performs an online update
+    of the model with that single document
+    """
+    processed_doc = preprocess(s)
+    # reuse the model's dictionary; a fresh one would produce mismatched token ids
+    bow_corpus = [lda_model.id2word.doc2bow(processed_doc)]
+    lda_model.update(bow_corpus)
 
-for ind in range(len(newsgroups_test)):
-    unseenDoc = newsgroups_test.data[ind]
-    print(ind, categorize_str(unseenDoc))
+
+# lda_model = create_model(newsgroups_train.data)
+# update_model("Hello everyone", lda_model)
+# print(categorize_str("Hello world", lda_model))
diff --git a/selector.py b/selector.py
new file mode 100644
index 0000000..422483a
--- /dev/null
+++ b/selector.py
@@ -0,0 +1,17 @@
+import nlp
+import random
+
+# get user preference from the database (e.g. how many times they clicked on a given type of article)
+# prob = [1/nlp.NUM_TOPICS for i in range(nlp.NUM_TOPICS)]
+# manipulate prob based on user preference
+
+
+def get_topics(weights, num_recommendations):
+    """
+    Takes in weights as a list/tuple, e.g. (0.1, 0.2, 0.3)
+    Returns a list of recommended topic numbers
+    """
+    return random.choices([*range(nlp.NUM_TOPICS)], weights, k=num_recommendations)
+
+
+# print(get_topics(prob, 4))
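
For reviewers, a minimal sketch of how the two modules are meant to compose after this change. The uniform prior and the +0.5 preference bump are illustrative assumptions standing in for the database-backed click counts that selector.py's comments mention but this PR does not implement yet.

```python
# Sketch only: the preference weights below are assumed, not part of this PR.
import nlp
import selector

# Train once on the 20 newsgroups training split (slow: 10 passes).
lda_model = nlp.create_model(nlp.newsgroups_train.data)

# Classify an unseen document into one of the NUM_TOPICS topics.
doc = nlp.newsgroups_test.data[0]
topic = nlp.categorize_str(doc, lda_model)
print("predicted topic:", topic)

# Optionally fold the new document back into the model online.
nlp.update_model(doc, lda_model)

# Start from a uniform preference, bump the topic the user just read,
# then sample 4 recommendations biased toward it.
prob = [1 / nlp.NUM_TOPICS] * nlp.NUM_TOPICS
prob[topic] += 0.5  # assumed bump size, purely illustrative
print(selector.get_topics(prob, 4))
```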