diff --git a/backend/nlp/nlp.py b/backend/nlp/nlp.py
new file mode 100644
index 0000000..13dc023
--- /dev/null
+++ b/backend/nlp/nlp.py
@@ -0,0 +1,81 @@
+import nltk
+import numpy as np
+import gensim
+from nltk.stem import WordNetLemmatizer, SnowballStemmer
+from gensim.parsing.preprocessing import STOPWORDS
+from gensim.utils import simple_preprocess
+from sklearn.datasets import fetch_20newsgroups
+
+# WordNetLemmatizer needs the wordnet corpus on first use
+nltk.download('wordnet', quiet=True)
+
+newsgroups_train = fetch_20newsgroups(subset='train')
+newsgroups_test = fetch_20newsgroups(subset='test')
+
+np.random.seed(400)
+stemmer = SnowballStemmer("english")
+NUM_TOPICS = 10
+
+
+def lemmatize_stemming(text):
+    # Lemmatize a single token (as a verb), then stem it
+    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
+
+
+def preprocess(text):
+    # Tokenize, then keep stemmed lemmas of non-stopword tokens longer than 3 chars
+    result = []
+    for token in simple_preprocess(text):
+        if token not in STOPWORDS and len(token) > 3:
+            result.append(lemmatize_stemming(token))
+    return result
+
+
+def categorize_str(s: str, lda_model, dictionary) -> int:
+    """
+    Determines which topic a string belongs to.
+    Returns the topic number as an int.
+    """
+    processed_doc = preprocess(s)
+    bow_vector = dictionary.doc2bow(processed_doc)
+    lda_results = sorted(lda_model[bow_vector], key=lambda tup: tup[1], reverse=True)
+    return lda_results[0][0]
+
+
+def create_model(documents: list):
+    """
+    Builds an LDA topic model from a list of document strings.
+    Returns a (lda_model, dictionary) tuple.
+    """
+    processed_docs = [preprocess(doc) for doc in documents]
+    dictionary = gensim.corpora.Dictionary(processed_docs)
+    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
+    lda_model = gensim.models.LdaMulticore(bow_corpus,
+                                           num_topics=NUM_TOPICS,
+                                           id2word=dictionary,
+                                           passes=10,
+                                           workers=2)
+    return (lda_model, dictionary)
+
+
+def update_model(s: str, lda_model, dictionary):
+    """
+    Performs an online update of the model with a new document string.
+    """
+    processed_doc = preprocess(s)
+    dictionary.add_documents([processed_doc])
+    # Tokens first seen here are outside the trained model's vocabulary,
+    # so keep only the ids the model already knows before updating
+    bow = [(i, n) for i, n in dictionary.doc2bow(processed_doc)
+           if i < lda_model.num_terms]
+    lda_model.update([bow])
+
+
+# lda_model, dictionary = create_model(newsgroups_train.data)
+# print(dictionary.num_docs)
+# print(categorize_str("finance", lda_model, dictionary))
+# print(categorize_str("football", lda_model, dictionary))
+# print(categorize_str("virus", lda_model, dictionary))
+# print(categorize_str("economy", lda_model, dictionary))
+# update_model("Hello everyone", lda_model, dictionary)
+# print(categorize_str("Hello world", lda_model, dictionary))
diff --git a/backend/nlp/requirements.txt b/backend/nlp/requirements.txt
new file mode 100644
index 0000000..442cd12
--- /dev/null
+++ b/backend/nlp/requirements.txt
@@ -0,0 +1,4 @@
+nltk==3.5
+numpy==1.20.1
+gensim==4.0.0
+scikit-learn==0.24.1
\ No newline at end of file
diff --git a/backend/nlp/selector.py b/backend/nlp/selector.py
new file mode 100644
index 0000000..e86653f
--- /dev/null
+++ b/backend/nlp/selector.py
@@ -0,0 +1,14 @@
+import nlp
+import random
+
+# TODO: get user preferences from the database (e.g. how many times the user
+# clicked on articles of each topic) and skew these uniform weights toward them:
+# prob = [1/nlp.NUM_TOPICS for i in range(nlp.NUM_TOPICS)]
+
+
+def get_topics(weights, num_recommendations):
+    """
+    Takes weights as a list/tuple with one entry per topic, e.g. (0.1, 0.2, ...).
+    Returns num_recommendations topic ids, sampled with replacement.
+    """
+    return random.choices(range(nlp.NUM_TOPICS), weights, k=num_recommendations)
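
For context, a minimal end-to-end sketch of how these pieces are meant to compose, assuming `nlp.py` and `selector.py` are importable from the working directory; `user_weights` is a made-up uniform placeholder for the database-driven preferences the TODO in `selector.py` describes:

```python
import nlp
import selector

# Train the LDA model once on the bundled 20 newsgroups training set
lda_model, dictionary = nlp.create_model(nlp.newsgroups_train.data)

# Classify an arbitrary piece of text into one of NUM_TOPICS topics
topic = nlp.categorize_str("the market rallied after the earnings report",
                           lda_model, dictionary)
print("predicted topic:", topic)

# Placeholder preferences: uniform weights until real click counts exist
user_weights = [1 / nlp.NUM_TOPICS] * nlp.NUM_TOPICS
print(selector.get_topics(user_weights, num_recommendations=3))

# Fold a newly seen document into the model online
nlp.update_model("central bank raises interest rates", lda_model, dictionary)
```

Note that `get_topics` samples with replacement, so the returned list can contain duplicate topic ids; deduplicate downstream if each recommendation slot should come from a distinct topic.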