forked from jina-ai/examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
120 lines (95 loc) · 3.81 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
__copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
__license__ = "Apache-2.0"
import os
import sys
import click
from jina.flow import Flow
from jina.logging import JinaLogger
from jina.logging.profile import TimeContext
from jina.logging import default_logger as logger
# Default for the `--num_docs` command-line option; can be overridden through
# the JINA_MAX_DOCS environment variable.
MAX_DOCS = int(os.getenv('JINA_MAX_DOCS', 50))

# Logger dedicated to this example. This binding replaces the `default_logger`
# that the import section brings in under the same name.
logger = JinaLogger('wikipedia-example')
def config():
    """Fill in defaults for the JINA_* environment variables.

    Variables that are already present in the environment keep their value;
    only missing ones receive the default listed below.
    """
    fallbacks = {
        'JINA_DATA_FILE': 'data/toy-input.txt',
        'JINA_DATA_FILE_INC': 'data/toy-input-incremental.txt',
        'JINA_WORKSPACE': 'workspace',
        'JINA_PORT': str(45678),
    }
    for variable, fallback in fallbacks.items():
        os.environ.setdefault(variable, fallback)
def print_topk(resp, sentence):
    """Print the matches found for ``sentence`` to stdout.

    Args:
        resp: Search response; ``resp.search.docs`` is iterated, and each
            document's ``matches`` supply ``score.value`` and ``text``.
        sentence: The query text, echoed in the header line printed for
            every document in the response.
    """
    for doc in resp.search.docs:
        print(f'Ta-Dah🔮, here are what we found for: {sentence}')
        for position, hit in enumerate(doc.matches):
            value = hit.score.value
            # Written as `not (value < 0.0)` rather than `value >= 0.0` so
            # that only strictly negative scores are hidden.
            if not (value < 0.0):
                print(f'> {position:>2d}({value:.2f}). {hit.text}')
def _index(f, data_fn, num_docs):
    """Index up to ``num_docs`` lines of a data file through the flow ``f``.

    Args:
        f: Flow to run. It is entered as a context manager for the whole
            indexing step and provides ``logger`` and ``index_lines``.
        data_fn: Name of the environment variable that holds the data file
            path. The path is joined onto this script's directory.
        num_docs: Maximum number of lines to index; capped at the number of
            lines the file actually contains.

    Raises:
        KeyError: If the environment variable named by ``data_fn`` is unset.
    """
    with f:
        # Read the variable once so the log line and the path always agree.
        data_file = os.environ[data_fn]
        f.logger.info(f'Indexing {data_file}')
        data_path = os.path.join(os.path.dirname(__file__), data_file)

        # Count lines by streaming the file inside a `with` block: the handle
        # is closed deterministically and the file is never held in memory.
        with open(data_path) as data_fp:
            available = sum(1 for _ in data_fp)
        num_docs = min(num_docs, available)

        with TimeContext(f'QPS: indexing {num_docs}', logger=f.logger):
            f.index_lines(filepath=data_path, batch_size=16, read_mode='r', size=num_docs)
def index(num_docs):
    """Index the file named by JINA_DATA_FILE using ``flows/index.yml``.

    Args:
        num_docs: Maximum number of documents to index.
    """
    _index(Flow().load_config('flows/index.yml'), 'JINA_DATA_FILE', num_docs)
def index_incremental(num_docs):
    """Index the base data file, then the incremental one, with one flow.

    The same flow object, loaded from ``flows/index_incremental.yml``, is used
    for both passes.

    Args:
        num_docs: Maximum number of documents to index in each pass.
    """
    flow = Flow().load_config('flows/index_incremental.yml')
    _index(flow, 'JINA_DATA_FILE', num_docs)
    _index(flow, 'JINA_DATA_FILE_INC', num_docs)
def query(top_k):
    """Run an interactive search loop against ``flows/query.yml``.

    Sentences are read from stdin one at a time and their matches printed
    with ``print_topk``. The loop ends on an empty line or when stdin is
    closed.

    Args:
        top_k: Passed to ``search_lines`` as ``top_k`` for every query.
    """
    def ppr(x):
        # `text` is looked up in the enclosing scope when the callback runs,
        # i.e. it is the sentence that was just submitted.
        print_topk(x, text)

    f = Flow().load_config('flows/query.yml')
    with f:
        while True:
            try:
                text = input('please type a sentence: ')
            except EOFError:
                # stdin was closed (Ctrl-D, or piped input ran out): finish
                # the session cleanly, the same way an empty line does.
                break
            if not text:
                break

            f.search_lines(
                lines=[text],
                line_format='text',
                on_done=ppr,
                top_k=top_k,
            )
def query_restful():
    """Start the query flow with the REST gateway enabled and block on it."""
    flow = Flow().load_config('flows/query.yml')
    flow.use_rest_gateway()
    with flow:
        flow.block()
@click.command()
@click.option(
    '--task',
    '-t',
    type=click.Choice(['index', 'index_incremental', 'query', 'query_restful'], case_sensitive=False),
    # Without `required`, omitting the option passes task=None and the checks
    # below fail with a TypeError instead of a usage message.
    required=True,
    help='Which step to run.',
)
@click.option('--num_docs', '-n', default=MAX_DOCS, help='Maximum number of documents to index.')
@click.option('--top_k', '-k', default=5, help='Value passed as top_k to each query.')
def main(task, num_docs, top_k):
    """Index data into the workspace or query an existing index.

    Index tasks refuse to run when the workspace directory already exists;
    query tasks refuse to run when it does not. Both cases exit with status 1.
    """
    config()
    workspace = os.environ['JINA_WORKSPACE']

    # Both index tasks start with 'index' and both query tasks with 'query'.
    if task.startswith('index') and os.path.exists(workspace):
        # Adjacent literals instead of backslash continuations, so the message
        # does not depend on how this source file is indented.
        logger.error(
            '\n +------------------------------------------------------------------------------------+ '
            '\n | 🤖🤖🤖 | '
            f'\n | The directory {workspace} already exists. Please remove it before indexing again. | '
            '\n | 🤖🤖🤖 | '
            '\n +------------------------------------------------------------------------------------+'
        )
        sys.exit(1)

    if task.startswith('query') and not os.path.exists(workspace):
        print(f'The directory {workspace} does not exist. Please index first via `python app.py -t index`')
        sys.exit(1)

    if task == 'index':
        index(num_docs)
    elif task == 'index_incremental':
        index_incremental(num_docs)
    elif task == 'query':
        query(top_k)
    elif task == 'query_restful':
        query_restful()
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()