From 8aab29759578013ea0e107b5cf2dac0b755597fe Mon Sep 17 00:00:00 2001
From: Edward Li <LegendEddie18@gmail.com>
Date: Sat, 27 Mar 2021 23:34:32 -0400
Subject: [PATCH] Add NLP code

---
 backend/nlp/nlp.py           | 82 ++++++++++++++++++++++++++++++++++++
 backend/nlp/requirements.txt |  5 +++
 backend/nlp/selector.py      | 14 ++++++
 3 files changed, 101 insertions(+)
 create mode 100644 backend/nlp/nlp.py
 create mode 100644 backend/nlp/requirements.txt
 create mode 100644 backend/nlp/selector.py

diff --git a/backend/nlp/nlp.py b/backend/nlp/nlp.py
new file mode 100644
index 0000000..13dc023
--- /dev/null
+++ b/backend/nlp/nlp.py
@@ -0,0 +1,82 @@
+import pandas as pd
+import nltk
+import numpy as np
+import gensim
+from nltk.stem.porter import *
+from nltk.stem import WordNetLemmatizer, SnowballStemmer
+from gensim.parsing.preprocessing import STOPWORDS
+from gensim.utils import simple_preprocess
+from sklearn.datasets import fetch_20newsgroups
+
+newsgroups_train = fetch_20newsgroups(subset='train')  # 'train' subset, fetched at import time
+newsgroups_test = fetch_20newsgroups(subset='test')  # 'test' subset, fetched at import time
+
+np.random.seed(400)  # fixed seed for NumPy's global random generator
+stemmer = SnowballStemmer("english")  # shared stemmer used by lemmatize_stemming
+NUM_TOPICS = 10  # topic count passed to the LDA model; also read by selector.get_topics
+
+
+def lemmatize_stemming(text):
+    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')  # lemmatize as a verb
+    return stemmer.stem(verb_lemma)  # then reduce the lemma to its English Snowball stem
+
+
+def preprocess(text):
+    """Tokenize text, drop stopwords and tokens of 3 chars or fewer, stem the rest."""
+    return [
+        lemmatize_stemming(word)
+        for word in simple_preprocess(text)
+        if len(word) > 3 and word not in STOPWORDS
+    ]
+
+
+def categorize_str(s: str, lda_model, dictionary) -> int:
+    """
+    Takes in a string to determine which topic it belongs to
+    Returns the id (int) of the topic with the highest score
+    """
+    # Preprocess once; the tokens feed straight into the bag-of-words vector
+    bow_vector = dictionary.doc2bow(preprocess(s))
+    # lda_model[bow] yields (topic_id, score) pairs; keep the best-scoring one
+    topic_id, _ = max(lda_model[bow_vector], key=lambda tup: tup[1])
+    return topic_id
+
+
+def create_model(documents: list):
+    """
+    Takes a list of strings and trains an LDA model with NUM_TOPICS topics
+    Returns a (lda_model, dictionary) tuple
+    """
+    tokenized = [preprocess(text) for text in documents]
+    # Dictionary built over every preprocessed document
+    dictionary = gensim.corpora.Dictionary(tokenized)
+    # One bag-of-words vector per document
+    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
+    model = gensim.models.LdaMulticore(corpus,
+                                       num_topics=NUM_TOPICS,
+                                       id2word=dictionary,
+                                       passes=10,
+                                       workers=2)
+    return model, dictionary
+
+
+def update_model(s: str, lda_model, dictionary):
+    """
+    Takes in a string to update model
+    Trains model using string; the dictionary is left unchanged
+    """
+    # The model's vocabulary is fixed when it is created, so the dictionary
+    # must not grow here: ids for new words would fall outside the model's
+    # term matrix. doc2bow skips words the dictionary does not know.
+    bow_corpus = [dictionary.doc2bow(preprocess(s))]
+    lda_model.update(bow_corpus)
+
+
+# lda_model, dictionary = create_model(newsgroups_train.data)
+# print(dictionary.num_docs)
+# print(categorize_str("finance", lda_model, dictionary))
+# print(categorize_str("football", lda_model, dictionary))
+# print(categorize_str("virus", lda_model, dictionary))
+# print(categorize_str("economy", lda_model, dictionary))
+# update_model("Hello everyone", lda_model, dictionary)
+# print(categorize_str("Hello world", lda_model, dictionary))
diff --git a/backend/nlp/requirements.txt b/backend/nlp/requirements.txt
new file mode 100644
index 0000000..442cd12
--- /dev/null
+++ b/backend/nlp/requirements.txt
@@ -0,0 +1,5 @@
+pandas==1.0.3
+nltk==3.5
+numpy==1.20.1
+gensim==4.0.0
+scikit-learn==0.24.1
\ No newline at end of file
diff --git a/backend/nlp/selector.py b/backend/nlp/selector.py
new file mode 100644
index 0000000..e86653f
--- /dev/null
+++ b/backend/nlp/selector.py
@@ -0,0 +1,14 @@
+import nlp
+import random
+
+# get user preference from database (i.e. how many times they clicked on some certain type of article)
+# prob = [1/nlp.NUM_TOPICS for i in range(nlp.NUM_TOPICS)]
+# manipulate prob based on user preference
+
+
+def get_topics(weights, num_reccomendations):
+    """
+    Takes in weights as list/tuple, one weight per topic (nlp.NUM_TOPICS in total)
+    Returns a list of num_reccomendations topic ids drawn with replacement
+    """
+    return random.choices(range(nlp.NUM_TOPICS), weights=weights, k=num_reccomendations)