Skip to content

Commit

Permalink
adjust for streamlit
Browse files Browse the repository at this point in the history
  • Loading branch information
marixko committed Oct 20, 2022
1 parent 07376cc commit 8b32282
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 58 deletions.
44 changes: 44 additions & 0 deletions arxivfyme.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pandas as pd
import numpy as np
import streamlit as st
from nltk import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import string
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import streamlit as st

# Streamlit entry point: load the arXiv dataset, tokenize the abstracts and
# render a word cloud whose size is controlled by a slider.
#
# These imports live here so the script body is self-contained: ``clean`` and
# ``show_wordcloud`` are defined in utils.py and were previously used without
# being imported.
import ast

from utils import clean, show_wordcloud

nltk.download("omw-1.4")
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

st.title('arXivfy me')

stemmer = PorterStemmer()
vectorizer = TfidfVectorizer()
stpwrds = set(stopwords.words("english"))

# read with pandas
df_pandas = pd.read_json('arxivData.json')

# convert string to python object
# NOTE: ast.literal_eval replaces eval here. It only accepts Python literals,
# whereas eval would execute any expression embedded in the data file.
for key in ["author", "link", "tag"]:
    df_pandas[key] = df_pandas[key].apply(ast.literal_eval)

df_pandas["tokens"] = df_pandas["summary"].agg(clean)
df_pandas['tokens_str'] = df_pandas['tokens'].apply(lambda x: ','.join(map(str, x)))
text = " ".join(df_pandas.tokens_str.astype(str))

# The slider picks the word budget; the text is always the data argument.
max_words = st.slider('max_words', 5, 500, 200, step=10)
fig = show_wordcloud(text, max_words)
# Fall back to the current pyplot figure if the helper returns nothing.
st.pyplot(fig if fig is not None else plt.gcf())
61 changes: 3 additions & 58 deletions dataviz.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from nltk.corpus import stopwords\n",
"from wordcloud import WordCloud\n",
"import matplotlib.pyplot as plt"
"import matplotlib.pyplot as plt\n",
"from utils import show_wordcloud"
]
},
{
Expand Down Expand Up @@ -248,22 +249,7 @@
"metadata": {},
"outputs": [],
"source": [
"stopwords = set(nltk.corpus.stopwords.words(\"english\"))\n",
"\n",
"def remove_linebreaks(s):\n",
" return s.replace(\"\\n\", \" \")\n",
"\n",
"def tokenize(s):\n",
" return word_tokenize(s, language=\"english\")\n",
"\n",
"def remove_stopwords(s):\n",
" return [w for w in s if not w in stpwrds]\n",
"\n",
"def stem(s):\n",
" return \" \".join([stemmer.stem(w.lower()) for w in s])\n",
"\n",
"def vectorize(s):\n",
" return vectorizer.fit_transform(s)\n"
"stopwords = set(nltk.corpus.stopwords.words(\"english\"))\n"
]
},
{
Expand All @@ -275,23 +261,6 @@
"sample = \"We propose an architecture for VQA which utilizes recurrent layers to generate visual and textual attention. The memory characteristic of the proposed recurrent attention units offers a rich joint embedding of visual and textual features and enables the model to reason relations between several parts of the image and question. Our single model outperforms the first place winner on the VQA 1.0 dataset, performs within margin to the current state-of-the-art ensemble model. We also experiment with replacing attention mechanisms in other state-of-the-art models with our implementation and show increased accuracy??\""
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def clean(s):\n",
" s = re.sub(r'\\d+', '', s) # remove numbers\n",
" s = \"\".join([char.lower() for char in s if char not in string.punctuation]) # remove punctuations and convert characters to lower case\n",
" s = re.sub('\\s+', ' ', s).strip() # substitute multiple whitespace with single whitespace\n",
" s = remove_linebreaks(s)\n",
" s = tokenize(s)\n",
" s = remove_stopwords(s)\n",
" # s = stem(s)\n",
" return s"
]
},
{
"cell_type": "code",
"execution_count": 9,
Expand All @@ -301,30 +270,6 @@
"tokens = df_pandas[\"summary\"].agg(clean)\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def show_wordcloud(data):\n",
" cloud = WordCloud(\n",
" background_color='white',\n",
" stopwords=stopwords,\n",
" max_words=100,\n",
" max_font_size=30,\n",
" scale=3,\n",
" random_state=1)\n",
" \n",
" output=cloud.generate(str(data))\n",
"\n",
" fig = plt.figure(1, figsize=(12, 12))\n",
" plt.axis('off')\n",
"\n",
" plt.imshow(output)\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 20,
Expand Down
63 changes: 63 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import pandas as pd
import numpy as np
import streamlit as st
from nltk import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
import string
import re
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def get_top_ngram(corpus, n=None, top_k=10):
    """Return the most frequent n-grams found in ``corpus``.

    Args:
        corpus: Collection of text documents. It is traversed twice (once to
            fit the vocabulary, once to count), so it must be re-iterable.
        n: Size of the n-grams to count. ``None`` (the default) counts single
            words; the old default produced ``ngram_range=(None, None)``,
            which is not a valid range.
        top_k: Maximum number of entries to return. Defaults to 10, the
            previously hard-coded cut-off.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by descending count.
    """
    size = 1 if n is None else n
    vec = CountVectorizer(ngram_range=(size, size)).fit(corpus)
    # Column-wise sum of the document-term matrix: one total per vocabulary
    # entry, indexed by the column ids stored in ``vec.vocabulary_``.
    totals = vec.transform(corpus).sum(axis=0)
    words_freq = [(word, totals[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_k]

def remove_linebreaks(s):
    """Return ``s`` with every newline character replaced by a single space."""
    pieces = s.split("\n")
    return " ".join(pieces)

def tokenize(s):
    """Split the text ``s`` into tokens with NLTK's English ``word_tokenize``."""
    tokens = word_tokenize(s, language="english")
    return tokens

_DEFAULT_STOPWORDS = None


def _default_stopwords():
    """Build the NLTK English stop-word set once and reuse it afterwards."""
    global _DEFAULT_STOPWORDS
    if _DEFAULT_STOPWORDS is None:
        _DEFAULT_STOPWORDS = frozenset(stopwords.words("english"))
    return _DEFAULT_STOPWORDS


def remove_stopwords(s, stop_words=None):
    """Return the tokens of ``s`` that are not stop words.

    Args:
        s: Iterable of tokens.
        stop_words: Container of words to drop. When omitted, the NLTK
            English stop-word list is used. The function used to read a
            global named ``stpwrds`` that this module never defines, so
            every call raised ``NameError``.

    Returns:
        A list with the surviving tokens, in their original order.
    """
    if stop_words is None:
        stop_words = _default_stopwords()
    return [w for w in s if w not in stop_words]

def stem(s, stemmer=None):
    """Lower-case and stem each token of ``s``, then join them with spaces.

    Args:
        s: Iterable of string tokens.
        stemmer: Object exposing a ``stem(word)`` method. When omitted, a
            fresh NLTK ``PorterStemmer`` is used. The function used to rely
            on a global ``stemmer`` that this module never defines.

    Returns:
        A single string containing the stemmed tokens separated by spaces.
    """
    if stemmer is None:
        # Imported locally because this module does not import PorterStemmer
        # at the top of the file.
        from nltk.stem import PorterStemmer

        stemmer = PorterStemmer()
    return " ".join(stemmer.stem(w.lower()) for w in s)

def vectorize(s, vectorizer=None):
    """Fit a vectorizer on ``s`` and return the transformed documents.

    Args:
        s: Collection of documents handed to ``fit_transform``.
        vectorizer: Object exposing ``fit_transform``. Pass your own instance
            to keep access to the fitted model afterwards. When omitted, a
            new ``TfidfVectorizer`` is created. The function used to rely on
            a global ``vectorizer`` that this module never defines.

    Returns:
        Whatever ``vectorizer.fit_transform(s)`` returns.
    """
    if vectorizer is None:
        # Imported locally because this module only imports CountVectorizer
        # at the top of the file.
        from sklearn.feature_extraction.text import TfidfVectorizer

        vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(s)

def clean(s):
    """Normalise a raw text string and return its non-stop-word tokens.

    Steps, in order: remove digits, remove ASCII punctuation, lower-case,
    collapse whitespace runs into single spaces, tokenize, and drop stop
    words. Stemming is not applied.

    Args:
        s: The text to clean.

    Returns:
        The list of tokens produced by ``remove_stopwords``.
    """
    s = re.sub(r"\d+", "", s)  # remove numbers
    # One translate pass strips every character in string.punctuation.
    s = s.translate(str.maketrans("", "", string.punctuation)).lower()
    # The raw string matters: a plain '\s' literal is an invalid escape
    # sequence. ``\s`` also matches newlines, so line breaks are already
    # turned into spaces here and need no separate pass.
    s = re.sub(r"\s+", " ", s).strip()
    return remove_stopwords(tokenize(s))


def show_wordcloud(data, max_words=100):
    """Render a word cloud for ``data`` and return the matplotlib figure.

    Args:
        data: Text to visualise; it is converted with ``str()`` before being
            passed to the word-cloud generator.
        max_words: Maximum number of words drawn in the cloud. Defaults to
            100, the value that used to be hard-coded while this argument
            was ignored.

    Returns:
        The matplotlib figure holding the rendered cloud, so callers such as
        ``st.pyplot`` can display it.
    """
    cloud = WordCloud(
        background_color="white",
        # The module-level ``stopwords`` name is the NLTK corpus reader, not
        # a collection of words, so the English list is materialised here.
        stopwords=set(stopwords.words("english")),
        max_words=max_words,
        max_font_size=30,
        scale=3,
        random_state=1,
    )
    output = cloud.generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    # Figure 1 is reused on every call; clear it so images do not pile up.
    fig.clf()
    plt.axis("off")
    plt.imshow(output)
    plt.show()
    return fig

0 comments on commit 8b32282

Please sign in to comment.