Create basic code to select topic, turn nlp into api

This commit is contained in:
Edward Li 2021-03-27 17:43:53 -04:00
parent 3141b3c1cd
commit 44f70ccda1
2 changed files with 41 additions and 26 deletions

50
nlp.py
View File

@ -1,6 +1,3 @@
import json
import pandas as pd import pandas as pd
import nltk import nltk
import numpy as np import numpy as np
@ -9,22 +6,14 @@ from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim.parsing.preprocessing import STOPWORDS from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess from gensim.utils import simple_preprocess
import gensim import gensim
from sklearn.datasets import fetch_20newsgroups from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True) newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
np.random.seed(400) np.random.seed(400)
stemmer = SnowballStemmer("english") stemmer = SnowballStemmer("english")
NUM_TOPICS = 7
# context = ssl._create_unverified_context()
def get_data():
    # Unimplemented stub: the body is a bare `pass`, so calling it does
    # nothing and returns None.
    # NOTE(review): this definition appears only on one side of the diff and
    # looks like it was removed by this commit -- confirm before relying on it.
    pass
def lemmatize_stemming(text): def lemmatize_stemming(text):
@ -41,11 +30,13 @@ def preprocess(text):
return result return result
def categorize_str(s: str, lda_model) -> int:
    """
    Takes in a string to determine which topic it belongs to.

    The string is preprocessed and turned into a bag-of-words vector using
    the dictionary stored on the model (``lda_model.id2word``). Building a
    new dictionary from the input alone would assign token ids that have
    nothing to do with the ids the model was trained on, so the model's own
    dictionary must be used. Words that are not in that dictionary are
    ignored by ``doc2bow``.

    Assumes ``lda_model`` was built with ``id2word`` set to a gensim
    ``Dictionary``, as ``create_model`` does.

    Returns the topic number with the highest probability as an int.
    """
    bow_vector = lda_model.id2word.doc2bow(preprocess(s))
    # lda_model[bow_vector] yields (topic_id, probability) pairs; pick the
    # most probable one. On ties the first pair wins, same as a stable sort.
    best_topic, _ = max(lda_model[bow_vector], key=lambda tup: tup[1])
    return best_topic
@ -54,25 +45,32 @@ def categorize_str(s: str) -> int:
def create_model(documents: list):
    """
    Takes a list of strings to create a model.

    Each document is run through ``preprocess``; a gensim dictionary and
    bag-of-words corpus are built from the results and used to train an
    LDA model with ``NUM_TOPICS`` topics.

    Returns the lda model. The dictionary is not returned separately; it is
    passed to the model as ``id2word``.
    """
    # Train on the documents passed in, not on a module-level dataset.
    processed_docs = [preprocess(doc) for doc in documents]
    dictionary = gensim.corpora.Dictionary(processed_docs)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_docs]
    lda_model = gensim.models.LdaMulticore(
        bow_corpus,
        num_topics=NUM_TOPICS,
        id2word=dictionary,
        passes=10,
        workers=2,
    )
    return lda_model
def update_model(s: str, lda_model):
    """
    Takes in a string to update the model.

    The string is preprocessed, converted to a bag-of-words vector with the
    model's own dictionary (``lda_model.id2word``) and passed to
    ``lda_model.update`` as a one-document corpus. Using a freshly built
    dictionary here would map the words to unrelated token ids and train the
    model on the wrong vocabulary entries. Words that are not in the model's
    dictionary are ignored by ``doc2bow``.

    Assumes ``lda_model`` was built with ``id2word`` set to a gensim
    ``Dictionary``, as ``create_model`` does.

    The model is modified in place; nothing is returned.
    """
    bow_corpus = [lda_model.id2word.doc2bow(preprocess(s))]
    lda_model.update(bow_corpus)
for ind in range(len(newsgroups_test)):
unseenDoc = newsgroups_test.data[ind] # lda_model = create_model(newsgroups_train.data)
print(ind, categorize_str(unseenDoc)) # update_model("Hello everyone", lda_model)
# print(categorize_str("Hello world", lda_model))

17
selector.py Normal file
View File

@ -0,0 +1,17 @@
import nlp
import random
# get user preference from database (e.g. how many times they clicked on a certain type of article)
# prob = [1/nlp.NUM_TOPICS for i in range(nlp.NUM_TOPICS)]
# manipulate prob based on user preference
def get_topics(weights, num_reccomendations):
    """
    Pick topics at random according to the given weights.

    weights: list/tuple of relative weights, one per topic,
             ex: (0.1, 0.2, 0.3)
    num_reccomendations: how many topics to draw

    Returns a list of topic numbers drawn with replacement from
    0 .. nlp.NUM_TOPICS - 1.
    """
    topic_ids = list(range(nlp.NUM_TOPICS))
    return random.choices(topic_ids, weights=weights, k=num_reccomendations)
# print(get_topics(prob, 4))