wordcloud

marixko · Oct 21, 2022 · 2e9c90c · 2e9c90c
1 parent 9d4644a
commit 2e9c90c
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 9 deletions.
diff --git a/arxivfyme.py b/arxivfyme.py
@@ -6,7 +6,7 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
 from nltk.corpus import stopwords
 import streamlit as st
-from utils import clean, show_wordcloud, get_paper_information, give_recomm  
+from utils import cleanv2, show_wordcloud, get_paper_information, give_recomm  
 from PIL import Image
 import dill as pickle
 
@@ -43,20 +43,15 @@
 # text = " ".join(summ for summ in df_pandas.tokens_str.astype(str))
 
 # df = pd.read_json('astro_ph_2022.json')
-# tokens = df["abstract"].agg(clean,lemma=True, stem=False)
-# df["tokens"] = tokens
-# df['tokens_str'] = df['tokens'].apply(lambda x: ','.join(map(str, x)))
-# text = " ".join(summ for summ in df.tokens_str.astype(str))
 
-# fig = show_wordcloud(text, st.slider('max_words', 5, 500, 200, step = 10))
-# st.pyplot(fig)
 
 
 df_astro = pd.read_json("astro_ph_2022.json")#[:N_max]
 df_bio = pd.read_json("q_bio_2022.json")
 df = pd.concat([df_astro, df_bio])
 df.reset_index(inplace=True)
 
+
 # X = vectorizer.fit_transform(tokens)
 # features = vectorizer.get_feature_names()
 
@@ -71,10 +66,21 @@
 data = get_paper_information("2207.00322")
 give_recomm(data["abstract"], vectorizer, df, 5)
 
+
 st.header("See what arXivfyMe recommends you today!")
 
 st.markdown("Based on any article, this app will check what are the most recommended articles for you to check out.")
 id = st.text_input("Write down an arXiv's ID (e.g. it can be one of your published articles or one that you really like):")
 n = st.sidebar.slider("Number of recommendations", 0,50,5)
 data = get_paper_information(id)
-give_recomm(data["abstract"], vectorizer,df, n )
+output = give_recomm(data["abstract"], vectorizer,df, n )
+
+
+st.header("Wordcloud")
+tokens = output["abstract"].agg(cleanv2)
+output["tokens"] = tokens
+output['tokens_str'] = output['tokens'].apply(lambda x: ','.join(map(str, x)))
+text = " ".join(summ for summ in output.tokens_str.astype(str))
+
+fig = show_wordcloud(text, st.slider('max_words', 10, 200, 50, step = 10))
+st.pyplot(fig)
diff --git a/utils.py b/utils.py
@@ -80,6 +80,16 @@ def clean(s):
     s = stem(s)
     return s
 
+def cleanv2(s):
+    """Pipe `s` through the cleaning helpers in order; return the final result."""
+    pipeline = (remove_latex, remove_punctuation, remove_linebreaks,
+                tokenize, remove_stopwords, lemmatizer)
+    result = s
+    for step in pipeline:
+        # Each helper receives the previous helper's output; no stem() here.
+        result = step(result)
+    return result
+
 def show_wordcloud(data, maxwords):
     cloud = WordCloud(
         background_color='white',
@@ -153,4 +163,4 @@ def give_recomm(data, vectorizer, df, n=5):
     newdf = newdf.iloc[1:,]
     st.write(newdf["title"].head(n))
 
-    return
+    return newdf.head(n)