mirror of
https://github.com/Rushilwiz/openly.git
synced 2025-04-06 21:30:17 -04:00
Add NLP code
This commit is contained in:
parent
5c0f08d524
commit
8aab297595
82
backend/nlp/nlp.py
Normal file
82
backend/nlp/nlp.py
Normal file
|
@ -0,0 +1,82 @@
|
||||||
|
import pandas as pd
|
||||||
|
import nltk
|
||||||
|
import numpy as np
|
||||||
|
import gensim
|
||||||
|
from nltk.stem.porter import *
|
||||||
|
from nltk.stem import WordNetLemmatizer, SnowballStemmer
|
||||||
|
from gensim.parsing.preprocessing import STOPWORDS
|
||||||
|
from gensim.utils import simple_preprocess
|
||||||
|
from sklearn.datasets import fetch_20newsgroups
|
||||||
|
|
||||||
|
# 20 Newsgroups train and test splits.
# NOTE(review): these calls run at import time, so merely importing this
# module loads both splits -- confirm that is intended.
newsgroups_train = fetch_20newsgroups(subset="train")
newsgroups_test = fetch_20newsgroups(subset="test")

# Seed NumPy's global random state.
np.random.seed(400)

# Stemmer shared by lemmatize_stemming().
stemmer = SnowballStemmer("english")

# Number of topics used when training the LDA model in create_model().
NUM_TOPICS = 10
|
||||||
|
|
||||||
|
|
||||||
|
def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb with WordNet, and the resulting
    lemma is then passed through the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess(text):
    """Tokenize *text* and return its normalized tokens as a list.

    Tokens that are gensim stopwords, or that have three or fewer characters,
    are dropped. Every remaining token is passed through
    ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def categorize_str(s: str, lda_model, dictionary) -> int:
    """
    Determine which topic a string most likely belongs to.

    The string is run through ``preprocess``, converted to a bag-of-words
    vector with ``dictionary`` and scored by ``lda_model``.

    Returns the id of the highest-scoring topic. When several topics share
    the top score, the one the model lists first is returned.

    Raises IndexError if the model reports no topics for the string.
    """
    # Preprocess only once; the tokens feed straight into the bag-of-words.
    bow_vector = dictionary.doc2bow(preprocess(s))
    topic_scores = lda_model[bow_vector]
    if not topic_scores:
        raise IndexError("LDA model returned no topics for the given text")
    # Only the best topic is needed, so take the maximum instead of sorting
    # the whole list. max() keeps the first entry on ties, matching a stable
    # descending sort.
    best = max(topic_scores, key=lambda pair: pair[1])
    return best[0]
|
||||||
|
|
||||||
|
|
||||||
|
def create_model(documents: list):
    """
    Train an LDA topic model on a list of raw document strings.

    Every document is run through ``preprocess``. The resulting token lists
    are used to build a gensim dictionary and a bag-of-words corpus, on which
    an ``LdaMulticore`` model with ``NUM_TOPICS`` topics is trained.

    Returns a ``(lda_model, dictionary)`` tuple.
    """
    token_lists = [preprocess(text) for text in documents]
    dictionary = gensim.corpora.Dictionary(token_lists)
    bow_corpus = list(map(dictionary.doc2bow, token_lists))
    lda_model = gensim.models.LdaMulticore(
        bow_corpus,
        num_topics=NUM_TOPICS,
        id2word=dictionary,
        passes=10,
        workers=2,
    )
    return lda_model, dictionary
|
||||||
|
|
||||||
|
|
||||||
|
def update_model(s: str, lda_model, dictionary):
    """
    Train an existing LDA model further on a single string.

    The string is run through ``preprocess``. Only tokens that ``dictionary``
    already contains are used: the document is recorded in the dictionary's
    statistics and the model is updated with its bag-of-words vector.

    Returns None. ``lda_model`` and ``dictionary`` are modified in place. If
    the string contains no known tokens, neither object is touched.
    """
    # Keep only tokens the dictionary already knows. Registering unseen
    # tokens would assign ids beyond the vocabulary the model was trained
    # with, and the model cannot be updated (or queried) with such ids.
    # NOTE(review): assumes ``dictionary`` is the one ``lda_model`` was built
    # from, as returned by create_model() -- confirm against callers.
    known_tokens = [
        token for token in preprocess(s) if token in dictionary.token2id
    ]
    if not known_tokens:
        # An empty bag-of-words gives the model nothing to learn from.
        return
    # Record the document in the dictionary; no new ids are created because
    # every token is already present.
    dictionary.add_documents([known_tokens])
    lda_model.update([dictionary.doc2bow(known_tokens)])
|
||||||
|
|
||||||
|
|
||||||
|
# lda_model, dictionary = create_model(newsgroups_train.data)
|
||||||
|
# print(dictionary.num_docs)
|
||||||
|
# print(categorize_str("finance", lda_model, dictionary))
|
||||||
|
# print(categorize_str("football", lda_model, dictionary))
|
||||||
|
# print(categorize_str("virus", lda_model, dictionary))
|
||||||
|
# print(categorize_str("economy", lda_model, dictionary))
|
||||||
|
# update_model("Hello everyone", lda_model, dictionary)
|
||||||
|
# print(categorize_str("Hello world", lda_model, dictionary))
|
5
backend/nlp/requirements.txt
Normal file
5
backend/nlp/requirements.txt
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
pandas==1.0.3
|
||||||
|
nltk==3.5
|
||||||
|
numpy==1.20.1
|
||||||
|
gensim==4.0.0
|
||||||
|
scikit-learn==0.24.1
|
14
backend/nlp/selector.py
Normal file
14
backend/nlp/selector.py
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
import nlp
|
||||||
|
import random
|
||||||
|
|
||||||
|
# get user preference from database (i.e. how many times they clicked on some certain type of article)
|
||||||
|
# prob = [1/nlp.NUM_TOPICS for i in range(nlp.NUM_TOPICS)]
|
||||||
|
# manipulate prob based on user preference
|
||||||
|
|
||||||
|
|
||||||
|
def get_topics(weights, num_reccomendations):
    """
    Draw topic ids at random according to per-topic weights.

    weights: list/tuple of relative weights, ex: (0.1, 0.2, 0.3); it needs
        one entry per topic (``nlp.NUM_TOPICS`` in total).
    num_reccomendations: how many topic ids to draw.

    Returns a list of topic ids. Sampling is done with replacement, so the
    same topic can appear more than once.
    """
    topic_ids = list(range(nlp.NUM_TOPICS))
    return random.choices(topic_ids, weights=weights, k=num_reccomendations)
|
Loading…
Reference in New Issue
Block a user