mirror of
https://github.com/Rushilwiz/think-openly.git
synced 2025-04-20 11:10:17 -04:00
Create basic code to select topic, turn nlp into api
This commit is contained in:
parent
3141b3c1cd
commit
44f70ccda1
50
nlp.py
50
nlp.py
|
@ -1,6 +1,3 @@
|
|||
|
||||
|
||||
import json
|
||||
import pandas as pd
|
||||
import nltk
|
||||
import numpy as np
|
||||
|
@ -9,22 +6,14 @@ from nltk.stem import WordNetLemmatizer, SnowballStemmer
|
|||
from gensim.parsing.preprocessing import STOPWORDS
|
||||
from gensim.utils import simple_preprocess
|
||||
import gensim
|
||||
|
||||
|
||||
from sklearn.datasets import fetch_20newsgroups
|
||||
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)
|
||||
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True)
|
||||
|
||||
newsgroups_train = fetch_20newsgroups(subset='train')
|
||||
newsgroups_test = fetch_20newsgroups(subset='test')
|
||||
|
||||
np.random.seed(400)
|
||||
|
||||
|
||||
stemmer = SnowballStemmer("english")
|
||||
|
||||
# context = ssl._create_unverified_context()
|
||||
|
||||
|
||||
def get_data():
|
||||
pass
|
||||
NUM_TOPICS = 7
|
||||
|
||||
|
||||
def lemmatize_stemming(text):
|
||||
|
@ -41,11 +30,13 @@ def preprocess(text):
|
|||
return result
|
||||
|
||||
|
||||
def categorize_str(s: str) -> int:
|
||||
def categorize_str(s: str, lda_model) -> int:
    """
    Takes in a string to determine which topic it belongs to
    Returns the topic number as an int

    s: raw text to classify; it is tokenized with preprocess()
    lda_model: trained gensim LDA model; assumed to have been built with a
        gensim Dictionary as id2word (as create_model does)
    """
    # Encode the text with the dictionary the model was trained on. A fresh
    # Dictionary built from this one document would hand out its own token
    # ids, which do not line up with the model's vocabulary.
    bow_vector = lda_model.id2word.doc2bow(preprocess(s))

    # lda_model[bow] yields (topic_id, probability) pairs; the most probable
    # topic goes first (ties keep the model's original order).
    ranked_topics = sorted(lda_model[bow_vector],
                           key=lambda tup: tup[1],
                           reverse=True)
    return ranked_topics[0][0]
|
||||
|
@ -54,25 +45,32 @@ def categorize_str(s: str) -> int:
|
|||
def create_model(documents: list):
    """
    Takes a list of strings to create model
    returns the lda model

    The dictionary built here is passed to the model as id2word, so it can
    be read back from the returned model instead of being returned separately.
    """
    # Tokenize the documents that were passed in (not a module-level dataset).
    processed_docs = [preprocess(doc) for doc in documents]

    dictionary = gensim.corpora.Dictionary(processed_docs)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=NUM_TOPICS,
                                           id2word=dictionary,
                                           passes=10,
                                           workers=2)
    return lda_model
|
||||
|
||||
|
||||
lda_model, dictionary = create_model(newsgroups_train.data)
|
||||
for idx, topic in lda_model.show_topics(formatted=False, num_words=30):
|
||||
print('Topic: {} \nWords: {}'.format(idx, [w[0] for w in topic]))
|
||||
def update_model(s: str, lda_model):
    """
    Takes in a string to update model
    Trains model using string

    s: raw text to learn from; it is tokenized with preprocess()
    lda_model: trained gensim LDA model, updated in place; assumed to have
        been built with a gensim Dictionary as id2word (as create_model does)
    """
    # Encode with the model's own dictionary so token ids match the ones the
    # model was trained with; a throwaway Dictionary built from this single
    # document would number its tokens independently of the model.
    bow_corpus = [lda_model.id2word.doc2bow(preprocess(s))]
    lda_model.update(bow_corpus)
|
||||
|
||||
for ind in range(len(newsgroups_test)):
|
||||
unseenDoc = newsgroups_test.data[ind]
|
||||
print(ind, categorize_str(unseenDoc))
|
||||
|
||||
# lda_model = create_model(newsgroups_train.data)
|
||||
# update_model("Hello everyone", lda_model)
|
||||
# print(categorize_str("Hello world", lda_model))
|
||||
|
|
17
selector.py
Normal file
17
selector.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
import nlp
|
||||
import random
|
||||
|
||||
# get user preference from database (e.g. how many times they clicked on a certain type of article)
|
||||
# prob = [1/nlp.NUM_TOPICS for i in range(nlp.NUM_TOPICS)]
|
||||
# manipulate prob based on user preference
|
||||
|
||||
|
||||
def get_topics(weights, num_reccomendations):
    """
    Picks topic numbers at random according to the given weights.

    weights: list/tuple of relative weights, one per topic,
        ex: (0.1, 0.2, 0.3)
    num_reccomendations: how many topic numbers to draw

    Returns a list of topic numbers taken from 0..nlp.NUM_TOPICS-1; draws are
    made with replacement, so a topic may appear more than once.
    """
    topic_ids = range(nlp.NUM_TOPICS)
    return random.choices(topic_ids, weights=weights, k=num_reccomendations)
|
||||
|
||||
|
||||
# print(get_topics(prob, 4))
|
Loading…
Reference in New Issue
Block a user