update files (now working fine!)
marixko committed Oct 21, 2022
1 parent 71e5a76 commit c97c7bf
Showing 5 changed files with 123 additions and 13 deletions.
Binary file added X.pickle
52 changes: 43 additions & 9 deletions arxivfyme.py
@@ -1,11 +1,15 @@
import pandas as pd
import streamlit as st
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from utils import clean, show_wordcloud, get_paper_information, give_recomm
import plotly.express as px
from PIL import Image
import dill as pickle

nltk.download("omw-1.4")
nltk.download("punkt")
@@ -14,7 +18,10 @@

st.title('arXivfy me')
# st.set_option('deprecation.showPyplotGlobalUse', False)

st.markdown("The [arXiv](https://arxiv.org/) is one of the best open-science platform today. It collects and serves about 15 000 new papers per day across all STEM fields. It contains about 2 million scientific publications today. Knowing and reading relevant literature is critical to any scientist's research. However, with the current enormous rate of publications, it is challenging for any scientists to keep up, find what is relevant closer to their interests. It results in sereval very inefficient aspects in the day-to-day work of scientists. The worst possible is when one finds a decade-old paper solving a problem after solving it themselves. There is an opportunity to help our community. Scientific papers contain domain specific words and language that are hard to search using general engines (e.g. Google, Bing, etc.). Domain specific engines exists (e.g. ADS) but their recommendations suffers from using author and citation networks. However, this procedure often leads to a biased view of the research on a given topic, commonly limited to recent work or close network of colleagues. Our goal is to provide a recommendation tool that helps preserve fairness and could help identify more representative research around a problem.")
image = Image.open('logo.png')
st.image(image)

stemmer = PorterStemmer()
vectorizer = TfidfVectorizer()
@@ -31,17 +38,44 @@

# df_pandas.head()

# tokens = df_pandas["summary"].agg(clean,lemma=True, stem=False)
# df_pandas["tokens"] = tokens
# df_pandas['tokens_str'] = df_pandas['tokens'].apply(lambda x: ','.join(map(str, x)))
# text = " ".join(summ for summ in df_pandas.tokens_str.astype(str))

# df = pd.read_json('astro_ph_2022.json')
# tokens = df["abstract"].agg(clean,lemma=True, stem=False)
# df["tokens"] = tokens
# df['tokens_str'] = df['tokens'].apply(lambda x: ','.join(map(str, x)))
# text = " ".join(summ for summ in df.tokens_str.astype(str))

# fig = show_wordcloud(text, st.slider('max_words', 5, 500, 200, step = 10))
# st.pyplot(fig)


df_astro = pd.read_json("astro_ph_2022.json")#[:N_max]
df_bio = pd.read_json("q_bio_2022.json")
df = pd.concat([df_astro, df_bio])
df.reset_index(inplace=True)

# X = vectorizer.fit_transform(tokens)
# features = vectorizer.get_feature_names()

# vectorizer = pickle.load(open("vectorizer.pickle", "wb"))

with open('vectorizer.pickle', 'rb') as f:
    vectorizer = pickle.load(f)
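# The pickled artifacts are assumed to have been produced offline, roughly like
# the following sketch (an assumption, not part of this commit):
#   vectorizer = TfidfVectorizer()
#   X = vectorizer.fit_transform(df["abstract"].astype(str))
#   with open("vectorizer.pickle", "wb") as f:
#       pickle.dump(vectorizer, f)
#   with open("X.pickle", "wb") as f:
#       pickle.dump(X, f)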


st.header("How does it work?")
st.markdown("Let's use https://arxiv.org/abs/2207.00322 as an example. The ID for this paper is 2207.00322. Let's check the 5 most recommended articles based on our entry.")
data = get_paper_information("2110.13901")
give_recomm(data["abstract"], vectorizer, df, 5)

st.header("See what arXivfyMe recommends you today!")

st.markdown("Based on any article, this app will check what are the most recommended articles for you to check out.")
id = st.text_input("Write down an arXiv's ID (e.g. it can be one of your published articles or one that you really like):")
n = st.sidebar.slider("Number of recommendations", 0,50,5)
data = get_paper_information(id)
give_recomm(data["abstract"], vectorizer,df, n )
7 changes: 6 additions & 1 deletion requirements.txt
@@ -4,4 +4,9 @@ pandas
streamlit
nltk
scikit-learn
wordcloud
Pillow
beautifulsoup4
# pickle is part of the standard library
scipy
dill
77 changes: 74 additions & 3 deletions utils.py
@@ -10,9 +10,16 @@
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
from bs4 import BeautifulSoup
from sklearn.metrics import pairwise_distances
import dill as pickle

nltk.download("stopwords")
stemmer = PorterStemmer()
vectorizer = TfidfVectorizer()
stpwrds = set(stopwords.words("english"))
additional_stopwords = set(('ie', 'eg', 'cf', 'etc', 'et', 'al'))
stpwrds.update(additional_stopwords)
@@ -65,7 +72,12 @@ def clean(s):
    s = remove_linebreaks(s)
    s = tokenize(s)
    s = remove_stopwords(s)
    # if lemma == True and stem==True:
    #     stem = False
    # if lemma:
    #     s = lemmatizer(s)
    # if stem:
    s = stem(s)
    return s
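# Example (illustrative only; the exact output depends on the tokenize/remove_stopwords/stem helpers):
#   clean("We study star formation in nearby galaxies.")
#   -> ['studi', 'star', 'format', 'nearbi', 'galaxi']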

def show_wordcloud(data, maxwords):
@@ -82,4 +94,63 @@ def show_wordcloud(data, maxwords):
    plt.axis('off')

    plt.imshow(output)
    plt.show()
    return fig


def plot_tsne(X_tsne, mask_astro, mask_bio):
    # X_tsne, mask_astro and mask_bio are assumed to be computed by the caller
    fig = plt.figure()
    ax = fig.add_subplot(111, aspect=1)
    ax.plot(X_tsne[mask_astro][:, 0], X_tsne[mask_astro][:, 1], ".", alpha=0.5, c="C0", label="Astro")
    ax.plot(X_tsne[mask_bio][:, 0], X_tsne[mask_bio][:, 1], ".", alpha=0.5, c="C1", label="Bio")
    ax.set_xlabel("t-SNE 1")
    ax.set_ylabel("t-SNE 2")
    ax.legend()
    fig.tight_layout()
    return fig
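# A possible way to produce the inputs (an assumption, not part of this commit):
#   import numpy as np
#   from sklearn.manifold import TSNE
#   X_tsne = TSNE(n_components=2, random_state=0).fit_transform(X.toarray())
#   mask_astro = np.arange(len(df)) < len(df_astro)  # astro rows come first in the concatenated df
#   mask_bio = ~mask_astro
#   fig = plot_tsne(X_tsne, mask_astro, mask_bio)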


def get_paper_information(paper_id: str) -> dict | str:
    url = f'https://arxiv.org/abs/{paper_id}'

    try:
        req = requests.get(url)
        req.raise_for_status()
    except requests.exceptions.HTTPError as err:
        # on an HTTP error, the error message is returned as a string instead of a dict
        return str(err)

    soup = BeautifulSoup(req.text, 'html.parser')
    content = soup.find('div', {'id': 'abs'})

    data = {}

    data['title'] = content.find('h1', {'class': 'title mathjax'})
    data['authors'] = content.find('div', {'class': 'authors'})
    data['abstract'] = content.find('blockquote', {'class': 'abstract mathjax'})

    # cleaning html: drop the "Title:"/"Authors:"/"Abstract:" descriptor spans, keep the text
    for key, tag in data.items():
        tag.span.decompose()
        data[key] = tag.text.strip()

    data['subject'] = soup.find('div', {'class': 'browse'}).find('div', {'class': 'current'}).text.strip()

    return data
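# Example of the returned structure (illustrative placeholders, not real output):
#   get_paper_information("2207.00322")
#   -> {'title': '...', 'authors': '...', 'abstract': '...', 'subject': '...'}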

def give_recomm(data, vectorizer, df, n=5):
    # X holds the precomputed TF-IDF matrix of the corpus abstracts
    with open('X.pickle', 'rb') as f:
        X = pickle.load(f)

    new_input = vectorizer.transform([data])
    # features = vectorizer.get_feature_names()

    ndb_dist_i = pairwise_distances(X, new_input)[:, 0]
    # sort_ind_i = ndb_dist_i.argsort()
    newdf = df.copy(deep=True)
    newdf.insert(1, "dist", ndb_dist_i)
    newdf.sort_values("dist", ascending=True, inplace=True)
    # st.write(sort_ind_i)
    newdf = newdf.iloc[1:, ]  # drop the closest entry, assumed to be the query paper itself
    st.write(newdf["title"].head(n))

    return
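# Note: pairwise_distances defaults to the Euclidean metric; for TF-IDF vectors the cosine
# distance is the more usual choice. A possible variant (an assumption, not what this commit does):
#   ndb_dist_i = pairwise_distances(X, new_input, metric="cosine")[:, 0]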
Binary file added vectorizer.pickle
