-
Notifications
You must be signed in to change notification settings - Fork 0
/
integration.sh
executable file
·211 lines (184 loc) · 6.53 KB
/
integration.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#!/bin/bash
# Integration test for the refpapers command-line tool.
#
# Builds a throw-away directory tree under /tmp containing a conf dir, an
# index dir and a data dir with a few empty dummy papers, and makes that
# tree the current directory for the rest of the script.
#
# -e: abort on the first failing command; -u: abort on unset variables.
set -eu

# Options are placed before the template so the call does not depend on
# GNU option permutation (which is disabled when POSIXLY_CORRECT is set).
TESTING_DIR=$(mktemp --directory --tmpdir=/tmp refpapers.XXXXXXXXXX)
CONF_DIR="${TESTING_DIR}/conf"
INDEX_DIR="${TESTING_DIR}/index"
DATA_DIR="${TESTING_DIR}/data"
# Only the path is defined here; this block does not create the inbox dir.
INBOX_DIR="${TESTING_DIR}/inbox"

# The directory stack entry pushed here is expected to be popped at the very
# end of the script.
pushd "${TESTING_DIR}"
echo "Performing integration test in ${TESTING_DIR}"

### Set up
mkdir -p "${CONF_DIR}"
mkdir -p "${INDEX_DIR}"
mkdir -p "${DATA_DIR}/topicA"
mkdir -p "${DATA_DIR}/topicB"

# some files to index
touch "${DATA_DIR}/topicA/A_-_T_2021.pdf"
touch "${DATA_DIR}/topicA/Author_-_Title_book_2021.pdf"
touch "${DATA_DIR}/topicB/Author_-_MoreTitle_survey_2020.pdf"
touch "${DATA_DIR}/topicA/Not_-_Shown_1900.pdf"

# dummy version forces reindexing
# The redirect target is quoted like every other path in this script, so an
# unexpected space in the path cannot turn into an "ambiguous redirect".
echo "dummy-version" > "${CONF_DIR}/schema_version"
# --- Phase 1: index without git, then exercise search / one / reindex ---

# Print an empty separator line followed by the given step heading.
announce() {
  printf '\n%s\n' "$1"
}

# Print the command line that is about to run, then run it.
# stdin is left untouched, so a here-document still reaches the command.
show_and_run() {
  printf '%s\n' "$*"
  "$@"
}

announce "### Create a minimal conf, perform full indexing"
printf '%s\n' "'Extraction failed' messages are ok: dummy files are empty"
# stdin supplies "y", the data dir and the index dir, in that order --
# presumably the answers to refpapers' conf-creation prompts (TODO confirm).
show_and_run refpapers index --confdir "${CONF_DIR}" <<ANSWERS
y
${DATA_DIR}
${INDEX_DIR}
ANSWERS

announce "### Perform a search (expecting 2 results)"
show_and_run refpapers search --confdir "${CONF_DIR}" author

announce "### Perform a search (expecting 0 results)"
show_and_run refpapers search --confdir "${CONF_DIR}" nosuch

announce "### Show details using bibtex key"
show_and_run refpapers one --confdir "${CONF_DIR}" author2021title

announce "### Add one, delete one, then perform (full) indexing"
removed_paper="${DATA_DIR}/topicB/Author_-_MoreTitle_survey_2020.pdf"
added_paper="${DATA_DIR}/topicB/Second_-_Phase_2020.pdf"
show_and_run rm "${removed_paper}"
show_and_run touch "${added_paper}"
show_and_run refpapers index --confdir "${CONF_DIR}"

announce "### author2020more should no longer be returned"
show_and_run refpapers search --confdir "${CONF_DIR}" author2020more

announce "### but second2020phase should be found"
show_and_run refpapers search --confdir "${CONF_DIR}" second
# --- Phase 2: turn the data directory into a git repository ---
echo ""
echo "### switching to git tracking"
# Overwrite conf.yml with a configuration that has use_git enabled. tee is
# used so the generated file is also shown in the test output.
# The viewer/extractor entries are nested under their parent keys: written
# flat, "pdf" and "djvu" would each appear twice at the top level and "paths"
# / "software" would be empty.
# NOTE(review): nesting reconstructed from the key order -- confirm against
# the refpapers configuration schema.
tee "${CONF_DIR}/conf.yml" <<GITCONF
fulltext_chars: 300000
extract_max_seconds: 1.0
use_git: True
git_uncommitted: "WARN"
paths:
    index: "${INDEX_DIR}"
    data: "${DATA_DIR}"
    log: "${CONF_DIR}/log"
software:
    viewers:
        pdf: "evince"
        djvu: "evince"
    extractors:
        pdf: "pdftotext -l 20"
        djvu: "None"
GITCONF
pushd "${DATA_DIR}"
git init
# On a machine without a configured git identity "git commit" fails, which
# would abort the whole test under "set -e". Provide a repository-local
# fallback only when nothing is configured, so an existing identity is kept.
git config user.name > /dev/null \
  || git config user.name "refpapers integration test"
git config user.email > /dev/null \
  || git config user.email "refpapers-test@example.invalid"
# "--" keeps a file name starting with "-" from being parsed as an option.
git add -- *
git commit -a -m "initial"
popd
echo ""
echo "### perform full reindexing"
echo "refpapers index --full --confdir ${CONF_DIR}"
refpapers index --full --confdir "${CONF_DIR}"
# --- Phase 3: incremental indexing driven by git history ---
# Every "git -C" target is quoted, matching the quoted paths used for the
# file operations next to them.
echo ""
echo "### Add one, delete one, modify one, then perform incremental indexing"
echo "# expecting one staged and one untracked warning"
# Commit 1: remove a previously indexed paper.
echo "rm ${DATA_DIR}/topicB/Second_-_Phase_2020.pdf"
rm "${DATA_DIR}/topicB/Second_-_Phase_2020.pdf"
echo "git -C ${DATA_DIR} commit -a -m remove old"
git -C "${DATA_DIR}" commit -a -m "remove old"
# Commit 2: add a new paper and change the content of an existing one.
echo "touch ${DATA_DIR}/topicB/Third_-_Phase_2020.pdf"
touch "${DATA_DIR}/topicB/Third_-_Phase_2020.pdf"
echo "echo modified > ${DATA_DIR}/topicA/Author_-_Title_book_2021.pdf"
echo "modified" > "${DATA_DIR}/topicA/Author_-_Title_book_2021.pdf"
echo "git -C ${DATA_DIR} add ${DATA_DIR}/topicB/Third_-_Phase_2020.pdf"
git -C "${DATA_DIR}" add "${DATA_DIR}/topicB/Third_-_Phase_2020.pdf"
echo "git -C ${DATA_DIR} commit -a -m phase 3"
git -C "${DATA_DIR}" commit -a -m "phase 3"
# Leave the work tree dirty on purpose: one file is staged but not committed
# and one file is never added, which is what the two expected warnings
# announced above refer to.
echo "touch ${DATA_DIR}/topicA/Warn_-_Uncommitted_2021.pdf"
touch "${DATA_DIR}/topicA/Warn_-_Uncommitted_2021.pdf"
echo "touch ${DATA_DIR}/topicA/Warn_-_Untracked_2021.pdf"
touch "${DATA_DIR}/topicA/Warn_-_Untracked_2021.pdf"
echo "git -C ${DATA_DIR} add ${DATA_DIR}/topicA/Warn_-_Uncommitted_2021.pdf"
git -C "${DATA_DIR}" add "${DATA_DIR}/topicA/Warn_-_Uncommitted_2021.pdf"
# No --full flag here, unlike the reindexing step right after "git init".
echo "refpapers index --confdir ${CONF_DIR}"
refpapers index --confdir "${CONF_DIR}"
echo ""
echo "### second2020phase should no longer be returned"
echo "refpapers search --confdir ${CONF_DIR} second2020phase"
refpapers search --confdir "${CONF_DIR}" second2020phase
echo ""
echo "### but third2020phase should be found"
echo "refpapers search --confdir ${CONF_DIR} third"
refpapers search --confdir "${CONF_DIR}" third
echo ""
echo "### original files should be found (expecting 1 result)"
echo "refpapers search --confdir ${CONF_DIR} author"
refpapers search --confdir "${CONF_DIR}" author
# --- Phase 4: git-annex plus the inbox workflow ---
echo ""
echo "### testing inbox feature"
# .txt is added as a dummy format for easy fulltext extraction
# Overwrite conf.yml again, now with use_git_annex, ids_chars, an api_cache
# path and "cat" as viewer/extractor for .txt. tee also prints the file.
# The viewer/extractor entries are nested under their parent keys: written
# flat, "pdf", "djvu" and "txt" would each appear twice at the top level.
# NOTE(review): nesting reconstructed from the key order -- confirm against
# the refpapers configuration schema.
tee "${CONF_DIR}/conf.yml" <<GITCONF
fulltext_chars: 300000
ids_chars: 5000
extract_max_seconds: 1.0
use_git: True
use_git_annex: True
git_uncommitted: "WARN"
paths:
    index: "${INDEX_DIR}"
    data: "${DATA_DIR}"
    log: "${CONF_DIR}/log"
    api_cache: "${CONF_DIR}/api_cache"
software:
    viewers:
        pdf: "evince"
        djvu: "evince"
        txt: "cat"
    extractors:
        pdf: "pdftotext -l 20"
        djvu: "None"
        txt: "cat"
GITCONF
pushd "${DATA_DIR}"
# The path is kept inside a single quoted string; splitting the quotes
# around the expansion left it unquoted.
echo "git -C ${DATA_DIR} annex init"
git -C "${DATA_DIR}" annex init
popd
# Preparing API cache: comment these out to also test retrieval
echo "# Preparing API cache..."
mkdir -p "${CONF_DIR}/api_cache"
# printf '%s\n' writes the JSON byte-for-byte; the "\u00f6" escape must reach
# the file unchanged regardless of how echo is configured to treat backslashes.
printf '%s\n' '["2004.04002", {"title": "Transfer learning and subword sampling for asymmetric-resource one-to-many neural translation", "year": 2020, "authors": ["Gr\u00f6nroos", "Virpioja", "Kurimo"], "doi": null, "arxiv": "2004.04002"}]' > "${CONF_DIR}/api_cache/arxiv.jsonl"
printf '%s\n' '["10.1101/708206", {"title": "Machine translation of cortical activity to text with an encoder-decoder framework", "year": 2019, "authors": ["Makin", "Moses", "Chang"], "doi": "10.1101/708206"}]' > "${CONF_DIR}/api_cache/crossref.jsonl"
echo "# Creating dummy inbox papers..."
mkdir -p "${INBOX_DIR}"
pushd "${INBOX_DIR}"
# file name containing space
printf '%s\n' "This file can be searched from crossref" "doi: 10.1101/708206" > "doi given.txt"
printf '%s\n' "This file can be searched from arxiv using the id arXiv:2004.04002" > arxiv.txt
echo "No identifiers" > noid.txt
echo "# Ingesting inbox..."
echo "refpapers inbox --confdir ${CONF_DIR}"
# The here-document is the scripted stdin for the interactive inbox command,
# which is run from inside the inbox directory.
# NOTE(review): which prompt each line answers depends on the order in which
# refpapers processes the three files -- confirm before editing the answers.
refpapers inbox --confdir "${CONF_DIR}" <<INBOX
misc
y
foo/bar
y
Manual title entered by hand
2022
Doe Anonymous
foo/bar
y
INBOX
popd
echo ""
echo "### ingested files should be found (expecting 1 result)"
echo "refpapers search --confdir ${CONF_DIR} Kurimo"
refpapers search --confdir "${CONF_DIR}" Kurimo
# Balances the pushd into the testing directory done during set-up.
popd
# Cleanup is left disabled, so the test tree stays available for inspection.
#rm -r "${TESTING_DIR}"
echo ""
echo "Integration test ran to completion"