update files (now working fine!)
marixko committed Oct 21, 2022
1 parent 71e5a76 commit c97c7bf
Showing 5 changed files with 123 additions and 13 deletions.
Binary file added X.pickle
52 changes: 43 additions & 9 deletions arxivfyme.py
@@ -1,11 +1,15 @@
import pandas as pd
import streamlit as st
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from utils import clean, show_wordcloud, get_paper_information, give_recomm
import plotly.express as px
from PIL import Image
import dill as pickle

nltk.download("omw-1.4")
nltk.download("punkt")
@@ -14,7 +18,10 @@

st.title('arXivfy me')
# st.set_option('deprecation.showPyplotGlobalUse', False)

st.markdown("The [arXiv](https://arxiv.org/) is one of the best open-science platform today. It collects and serves about 15 000 new papers per day across all STEM fields. It contains about 2 million scientific publications today. Knowing and reading relevant literature is critical to any scientist's research. However, with the current enormous rate of publications, it is challenging for any scientists to keep up, find what is relevant closer to their interests. It results in sereval very inefficient aspects in the day-to-day work of scientists. The worst possible is when one finds a decade-old paper solving a problem after solving it themselves. There is an opportunity to help our community. Scientific papers contain domain specific words and language that are hard to search using general engines (e.g. Google, Bing, etc.). Domain specific engines exists (e.g. ADS) but their recommendations suffers from using author and citation networks. However, this procedure often leads to a biased view of the research on a given topic, commonly limited to recent work or close network of colleagues. Our goal is to provide a recommendation tool that helps preserve fairness and could help identify more representative research around a problem.")
image = Image.open('logo.png')
st.image(image)

stemmer = PorterStemmer()
vectorizer = TfidfVectorizer()
@@ -31,17 +38,44 @@

# df_pandas.head()

# tokens = df_pandas["summary"].agg(clean,lemma=True, stem=False)
# df_pandas["tokens"] = tokens
# df_pandas['tokens_str'] = df_pandas['tokens'].apply(lambda x: ','.join(map(str, x)))
# text = " ".join(summ for summ in df_pandas.tokens_str.astype(str))

# df = pd.read_json('astro_ph_2022.json')
# tokens = df["abstract"].agg(clean,lemma=True, stem=False)
# df["tokens"] = tokens
# df['tokens_str'] = df['tokens'].apply(lambda x: ','.join(map(str, x)))
# text = " ".join(summ for summ in df.tokens_str.astype(str))

# fig = show_wordcloud(text, st.slider('max_words', 5, 500, 200, step = 10))
# st.pyplot(fig)


df_astro = pd.read_json("astro_ph_2022.json")#[:N_max]
df_bio = pd.read_json("q_bio_2022.json")
df = pd.concat([df_astro, df_bio])
df.reset_index(inplace=True)

# X = vectorizer.fit_transform(tokens)
# features = vectorizer.get_feature_names()

# vectorizer = pickle.load(open("vectorizer.pickle", "wb"))

with open('vectorizer.pickle', 'rb') as f:
    vectorizer = pickle.load(f)
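# The pickled artifacts are assumed to have been produced offline, roughly like
# the following sketch (an assumption, not part of this commit):
#   vectorizer = TfidfVectorizer()
#   X = vectorizer.fit_transform(df["abstract"].astype(str))
#   with open("vectorizer.pickle", "wb") as f:
#       pickle.dump(vectorizer, f)
#   with open("X.pickle", "wb") as f:
#       pickle.dump(X, f)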


st.header("How does it work?")
st.markdown("Let's use https://arxiv.org/abs/2207.00322 as an example. The ID for this paper is 2207.00322. Let's check the 5 most recommended articles based on our entry.")
data = get_paper_information("2110.13901")
give_recomm(data["abstract"], vectorizer, df, 5)

st.header("See what arXivfyMe recommends you today!")

st.markdown("Based on any article, this app will check what are the most recommended articles for you to check out.")
id = st.text_input("Write down an arXiv's ID (e.g. it can be one of your published articles or one that you really like):")
n = st.sidebar.slider("Number of recommendations", 0,50,5)
data = get_paper_information(id)
give_recomm(data["abstract"], vectorizer,df, n )
7 changes: 6 additions & 1 deletion requirements.txt
@@ -4,4 +4,9 @@ pandas
streamlit
nltk
scikit-learn
wordcloud
Pillow
beautifulsoup4
# pickle is part of the standard library
scipy
dill
77 changes: 74 additions & 3 deletions utils.py
@@ -10,9 +10,16 @@
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
from bs4 import BeautifulSoup
from sklearn.metrics import pairwise_distances
import dill as pickle

nltk.download("stopwords")
stemmer = PorterStemmer()
vectorizer = TfidfVectorizer()
stpwrds = set(stopwords.words("english"))
additional_stopwords = set(('ie', 'eg', 'cf', 'etc', 'et', 'al'))
stpwrds.update(additional_stopwords)
@@ -65,7 +72,12 @@ def clean(s):
    s = remove_linebreaks(s)
    s = tokenize(s)
    s = remove_stopwords(s)
    # if lemma == True and stem==True:
    #     stem = False
    # if lemma:
    #     s = lemmatizer(s)
    # if stem:
    s = stem(s)
    return s
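# Example (illustrative only; the exact output depends on the tokenize/remove_stopwords/stem helpers):
#   clean("We study star formation in nearby galaxies.")
#   -> ['studi', 'star', 'format', 'nearbi', 'galaxi']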

def show_wordcloud(data, maxwords):
@@ -82,4 +94,63 @@ def show_wordcloud(data, maxwords):
    plt.axis('off')

    plt.imshow(output)
    plt.show()
    return fig


def plot_tsne(X_tsne, mask_astro, mask_bio):
    # X_tsne, mask_astro and mask_bio are assumed to be computed by the caller
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect=1)
    ax.plot(X_tsne[mask_astro][:, 0], X_tsne[mask_astro][:, 1], ".", alpha=0.5, c="C0", label="Astro")
    ax.plot(X_tsne[mask_bio][:, 0], X_tsne[mask_bio][:, 1], ".", alpha=0.5, c="C1", label="Bio")
    ax.set_xlabel("t-SNE 1")
    ax.set_ylabel("t-SNE 2")
    ax.legend()
    fig.tight_layout()
    return fig
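# A possible way to produce the inputs (an assumption, not part of this commit):
#   import numpy as np
#   from sklearn.manifold import TSNE
#   X_tsne = TSNE(n_components=2, random_state=0).fit_transform(X.toarray())
#   mask_astro = np.arange(len(df)) < len(df_astro)  # astro rows come first in the concatenated df
#   mask_bio = ~mask_astro
#   fig = plot_tsne(X_tsne, mask_astro, mask_bio)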


def get_paper_information(paper_id: str) -> dict | str:
    url = f'https://arxiv.org/abs/{paper_id}'

    try:
        req = requests.get(url)
        req.raise_for_status()
    except requests.exceptions.HTTPError as err:
        # on an HTTP error, the error message is returned as a string instead of a dict
        return str(err)

    soup = BeautifulSoup(req.text, 'html.parser')
    content = soup.find('div', {'id': 'abs'})

    data = {}

    data['title'] = content.find('h1', {'class': 'title mathjax'})
    data['authors'] = content.find('div', {'class': 'authors'})
    data['abstract'] = content.find('blockquote', {'class': 'abstract mathjax'})

    # cleaning html: drop the "Title:"/"Authors:"/"Abstract:" descriptor spans, keep the text
    for key, tag in data.items():
        tag.span.decompose()
        data[key] = tag.text.strip()

    data['subject'] = soup.find('div', {'class': 'browse'}).find('div', {'class': 'current'}).text.strip()

    return data
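# Example of the returned structure (illustrative placeholders, not real output):
#   get_paper_information("2207.00322")
#   -> {'title': '...', 'authors': '...', 'abstract': '...', 'subject': '...'}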

def give_recomm(data, vectorizer, df, n=5):
    # X holds the precomputed TF-IDF matrix of the corpus abstracts
    with open('X.pickle', 'rb') as f:
        X = pickle.load(f)

    new_input = vectorizer.transform([data])
    # features = vectorizer.get_feature_names()

    ndb_dist_i = pairwise_distances(X, new_input)[:, 0]
    # sort_ind_i = ndb_dist_i.argsort()
    newdf = df.copy(deep=True)
    newdf.insert(1, "dist", ndb_dist_i)
    newdf.sort_values("dist", ascending=True, inplace=True)
    # st.write(sort_ind_i)
    newdf = newdf.iloc[1:, ]  # drop the closest entry, assumed to be the query paper itself
    st.write(newdf["title"].head(n))

    return
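# Note: pairwise_distances defaults to the Euclidean metric; for TF-IDF vectors the cosine
# distance is the more usual choice. A possible variant (an assumption, not what this commit does):
#   ndb_dist_i = pairwise_distances(X, new_input, metric="cosine")[:, 0]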
Binary file added vectorizer.pickle
