Skip to content

Commit

Permalink
Introduced a num_keywords argument here to adjust the number of top keywords to be presented by the Visualizer.
Browse files Browse the repository at this point in the history
  • Loading branch information
SarthakJShetty committed Nov 2, 2020
1 parent 7189d7f commit cce974d
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 8 deletions.
9 changes: 7 additions & 2 deletions pyResearchInsights/NLP_Engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def lemmatization(status_logger_name, textual_data, allowed_postags=['NOUN', 'AD

return texts_out

def nlp_engine_main(abstracts_log_name, status_logger_name, num_topics = None):
def nlp_engine_main(abstracts_log_name, status_logger_name, num_topics = None, num_keywords = None):
nlp_engine_main_start_status_key = "Initiating the NLP Engine"
status_logger(status_logger_name, nlp_engine_main_start_status_key)

Expand All @@ -150,6 +150,11 @@ def nlp_engine_main(abstracts_log_name, status_logger_name, num_topics = None):
'''If the user has not provided this argument then set to 10'''
num_topics = 10

'''Declaring the number of keywords to be presented by the Visualizer'''
if num_keywords == None:
'''If the user has not provided this argument then set to 20'''
num_keywords = 20

'''Extracts the data from the .txt file and puts them into a Pandas dataframe buckets'''
textual_dataframe = data_reader(abstracts_log_name, status_logger_name)
'''Rids the symbols and special characters from the textual_data'''
Expand Down Expand Up @@ -185,6 +190,6 @@ def nlp_engine_main(abstracts_log_name, status_logger_name, num_topics = None):
status_logger(status_logger_name, nlp_engine_main_end_status_key)

'''Importing the visualizer_main function to view the LDA Model built by the NLP_engine_main() function'''
visualizer_main(lda_model, corpus, id2word, textual_data_lemmatized, num_topics, abstracts_log_name, status_logger_name)
visualizer_main(lda_model, corpus, id2word, textual_data_lemmatized, num_topics, num_keywords, abstracts_log_name, status_logger_name)

return 0
12 changes: 6 additions & 6 deletions pyResearchInsights/Visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@ def visualizer_generator(lda_model, corpus, id2word, logs_folder_name, status_lo
visualizer_generator_end_status_key = "Prepared the topic modeling visualization"+" "+logs_folder_name+"/"+"Data_Visualization_Topic_Modelling.html"
status_logger(status_logger_name, visualizer_generator_end_status_key)

def topic_builder(lda_model, topic_order, num_topics, textual_data_lemmatized, logs_folder_name, status_logger_name):
def topic_builder(lda_model, topic_order, num_topics, num_keywords, textual_data_lemmatized, logs_folder_name, status_logger_name):
'''We generate histograms here to present the frequency and weights of the keywords of each topic and save them to the disc for further analysis'''
topic_builder_start_status_key = "Preparing the frequency and weights vs keywords charts"
status_logger(status_logger_name, topic_builder_start_status_key)

'''Setting the colormaps here to generate the num_topics charts that proceed'''
colorchart = cm.get_cmap('plasma', num_topics)

topics = lda_model.show_topics(num_topics = -1, num_words = 20, formatted=False)
topics = lda_model.show_topics(num_topics = -1, num_words = num_keywords, formatted=False)
data_flat = [w for w_list in textual_data_lemmatized for w in w_list]
counter = Counter(data_flat)

Expand All @@ -62,8 +62,8 @@ def topic_builder(lda_model, topic_order, num_topics, textual_data_lemmatized, l

for topic in topic_order:
'''Progressively generating the figures comprising the weights and frequencies for each keyword in each topic'''
fig, ax = plt.subplots(1,1, figsize=[20, 15])
x_axis = [x_axis_element for x_axis_element in range(0, 20)]
fig, ax = plt.subplots(1, 1, figsize=[20, 15])
x_axis = [x_axis_element for x_axis_element in range(0, num_keywords)]

'''Creating the x_axis labels here, which is the topic keywords'''
x_axis_labels = [element for element in df.loc[df.topic_id==topic, 'word']]
Expand Down Expand Up @@ -182,7 +182,7 @@ def trends_histogram(abstracts_log_name, logs_folder_name, trend_keywords, statu
trends_histogram_end_status_key = "Generated the trends graph"+" "+logs_folder_name+"/"+"Data_Visualization_Trends_Graph"+"_"+trend_keywords[0]+".png"
status_logger(status_logger_name, trends_histogram_end_status_key)

def visualizer_main(lda_model, corpus, id2word, textual_data_lemmatized, num_topics, abstracts_log_name, status_logger_name):
def visualizer_main(lda_model, corpus, id2word, textual_data_lemmatized, num_topics, num_keywords, abstracts_log_name, status_logger_name):
visualizer_main_start_status_key = "Entering the visualizer_main() code"
status_logger(status_logger_name, visualizer_main_start_status_key)

Expand All @@ -201,7 +201,7 @@ def visualizer_main(lda_model, corpus, id2word, textual_data_lemmatized, num_top
topic_order = visualizer_generator(lda_model, corpus, id2word, logs_folder_name, status_logger_name)

'''We generate histograms here to present the frequency and weights of the keywords of each topic'''
topic_builder(lda_model, topic_order, num_topics, textual_data_lemmatized, logs_folder_name, status_logger_name)
topic_builder(lda_model, topic_order, num_topics, num_keywords, textual_data_lemmatized, logs_folder_name, status_logger_name)

visualizer_main_end_status_key = "Exiting the visualizer_main() code"
status_logger(status_logger_name, visualizer_main_end_status_key)

0 comments on commit cce974d

Please sign in to comment.