Skip to content

Commit

Permalink
wordcloud
Browse files Browse the repository at this point in the history
  • Loading branch information
marixko committed Oct 21, 2022
1 parent 9d4644a commit 2e9c90c
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 9 deletions.
22 changes: 14 additions & 8 deletions arxivfyme.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import streamlit as st
from utils import clean, show_wordcloud, get_paper_information, give_recomm
from utils import cleanv2, show_wordcloud, get_paper_information, give_recomm
from PIL import Image
import dill as pickle

Expand Down Expand Up @@ -43,20 +43,15 @@
# text = " ".join(summ for summ in df_pandas.tokens_str.astype(str))

# df = pd.read_json('astro_ph_2022.json')
# tokens = df["abstract"].agg(clean,lemma=True, stem=False)
# df["tokens"] = tokens
# df['tokens_str'] = df['tokens'].apply(lambda x: ','.join(map(str, x)))
# text = " ".join(summ for summ in df.tokens_str.astype(str))

# fig = show_wordcloud(text, st.slider('max_words', 5, 500, 200, step = 10))
# st.pyplot(fig)


# Load the two 2022 JSON datasets (file names suggest the astro-ph and q-bio
# arXiv categories -- TODO confirm) and stack them into a single DataFrame.
df_astro = pd.read_json("astro_ph_2022.json")#[:N_max]
df_bio = pd.read_json("q_bio_2022.json")
# concat keeps each frame's own row labels, so labels can repeat across the
# two sources until the index is rebuilt below.
df = pd.concat([df_astro, df_bio])
# Rebuild the index in place. With the default drop=False the previous row
# labels are kept as a new "index" column rather than discarded.
df.reset_index(inplace=True)


# X = vectorizer.fit_transform(tokens)
# features = vectorizer.get_feature_names()

Expand All @@ -71,10 +66,21 @@
# Fetch the paper information for one hard-coded arXiv ID and run the
# recommender on its abstract, asking for 5 results. The return value of
# give_recomm is not used here.
data = get_paper_information("2207.00322")
give_recomm(data["abstract"], vectorizer, df, 5)


# --- Recommendation section -------------------------------------------------
st.header("See what arXivfyMe recommends you today!")

st.markdown("Based on any article, this app will check what are the most recommended articles for you to check out.")
# NOTE(review): `id` shadows the Python builtin of the same name for the rest
# of the script; consider renaming (e.g. `arxiv_id`).
id = st.text_input("Write down an arXiv's ID (e.g. it can be one of your published articles or one that you really like):")
# Sidebar slider: min 0, max 50, default 5.
n = st.sidebar.slider("Number of recommendations", 0,50,5)
# NOTE(review): st.text_input yields an empty string until the user types
# something -- confirm get_paper_information copes with that input.
data = get_paper_information(id)
# NOTE(review): give_recomm is called on the next two statements with the same
# arguments. In this diff view the first call looks like the pre-change line
# and the second like its replacement -- confirm only one remains in the file.
give_recomm(data["abstract"], vectorizer,df, n )
output = give_recomm(data["abstract"], vectorizer,df, n )


# --- Wordcloud section ------------------------------------------------------
st.header("Wordcloud")
# Clean the abstracts of the recommended papers with cleanv2 (presumably
# applied per abstract -- TODO confirm how Series.agg dispatches here).
tokens = output["abstract"].agg(cleanv2)
output["tokens"] = tokens
# Turn each row's tokens into one comma-separated string ...
output['tokens_str'] = output['tokens'].apply(lambda x: ','.join(map(str, x)))
# ... then join all rows with spaces into the single text fed to the wordcloud.
text = " ".join(summ for summ in output.tokens_str.astype(str))

# The slider value (min 10, max 200, default 50, step 10) is passed as the
# second argument (`maxwords`) of show_wordcloud.
fig = show_wordcloud(text, st.slider('max_words', 10, 200, 50, step = 10))
st.pyplot(fig)
12 changes: 11 additions & 1 deletion utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,16 @@ def clean(s):
s = stem(s)
return s

def cleanv2(s):
    """Run the text-cleaning pipeline on ``s`` without the stemming step.

    Applies, in order, the module helpers ``remove_latex``,
    ``remove_punctuation``, ``remove_linebreaks``, ``tokenize``,
    ``remove_stopwords`` and ``lemmatizer``. Unlike ``clean``, no ``stem``
    call is made.

    Args:
        s: Raw text to clean (callers pass a paper abstract).

    Returns:
        Whatever ``lemmatizer`` returns for the processed input -- presumably
        a sequence of lemmatized tokens; verify against ``lemmatizer``.
    """
    # Strip markup and noise while the input is still a single string.
    s = remove_latex(s)
    s = remove_punctuation(s)
    s = remove_linebreaks(s)
    # From here on the helpers operate on the output of tokenize.
    s = tokenize(s)
    s = remove_stopwords(s)
    s = lemmatizer(s)

    return s

def show_wordcloud(data, maxwords):
cloud = WordCloud(
background_color='white',
Expand Down Expand Up @@ -153,4 +163,4 @@ def give_recomm(data, vectorizer, df, n=5):
newdf = newdf.iloc[1:,]
st.write(newdf["title"].head(n))

return
return newdf.head(n)

0 comments on commit 2e9c90c

Please sign in to comment.