-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
89 lines (76 loc) · 2.57 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import re
from io import StringIO
from typing import BinaryIO, List, cast
import openai
from cleantext import clean
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.utils import FileOrName, open_filename
# TODO Remove if dependency on langchain stays
def pdf2text(pdf_file: FileOrName) -> str:
    """Extract the text of a PDF with pdfminer and strip unwanted characters.

    Args:
        pdf_file: Anything pdfminer's ``open_filename`` accepts (typed as
            ``FileOrName``); it is opened in ``"rb"`` mode.

    Returns:
        The concatenated text of all pages, with the characters matched by
        the filter below removed and trailing whitespace stripped.
    """
    # pdfminer boilerplate: the converter writes the extracted text into `sio`.
    rsrcmgr = PDFResourceManager()
    with StringIO() as sio:
        device = TextConverter(rsrcmgr, sio, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open_filename(pdf_file, "rb") as fp:
                binary_fp = cast(BinaryIO, fp)
                for page in PDFPage.get_pages(binary_fp):
                    interpreter.process_page(page)
            text = sio.getvalue()
        finally:
            # Release the converter even when a page fails to parse.
            device.close()

    # Drop control characters but keep \t, \n and \r.
    # NOTE: the \x7f-\xff range also removes Latin-1 supplement characters
    # (e.g. accented letters), not only nonprintable ones.
    return re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]", "", text).rstrip()
def chunk_text(text: str, chunk_len: int = 256, do_overlap: bool = False, overlap_size: int = 15) -> List[str]:
    """Split *text* into consecutive chunks of at most *chunk_len* characters.

    Args:
        text: The string to split.
        chunk_len: Maximum number of characters per chunk.
        do_overlap: When true, each chunk starts ``overlap_size`` characters
            before the end of the previous one.
        overlap_size: Number of characters shared by neighbouring chunks;
            only used when ``do_overlap`` is true.

    Returns:
        The list of chunks in order; empty when *text* is empty. The last
        chunk may be shorter than *chunk_len*.

    Raises:
        ValueError: If the distance between chunk starts would not be
            positive (``chunk_len <= 0``, or overlapping with
            ``overlap_size >= chunk_len``). Such arguments previously made
            the loop run forever.
    """
    # Distance between the start offsets of two consecutive chunks.
    step = chunk_len - overlap_size if do_overlap else chunk_len
    if step <= 0:
        raise ValueError(
            "chunk_len must be positive and larger than overlap_size when overlapping "
            f"(chunk_len={chunk_len}, do_overlap={do_overlap}, overlap_size={overlap_size})"
        )
    return [text[start : start + chunk_len] for start in range(0, len(text), step)]
def clean_text(text: str, lang: str = "en") -> str:
    """Normalize *text* with ``cleantext.clean`` using a fixed option set.

    Args:
        text: The raw text to clean.
        lang: Language code forwarded to ``clean`` as its ``lang`` option.

    Returns:
        Whatever ``clean`` returns for the options below.
    """
    # Which transformations to apply.
    switches = {
        "fix_unicode": True,
        "to_ascii": True,
        "lower": False,
        "normalize_whitespace": True,
        "no_line_breaks": False,
        "strip_lines": True,
        "keep_two_line_breaks": False,
        "no_urls": True,
        "no_emails": True,
        "no_phone_numbers": False,
        "no_numbers": False,
        "no_digits": False,
        "no_currency_symbols": False,
        "no_punct": False,
        "no_emoji": True,
    }
    # Placeholder tokens passed alongside the switches.
    placeholders = {
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_phone_number": "<PHONE>",
        "replace_with_number": "<NUMBER>",
        "replace_with_digit": "0",
        "replace_with_currency_symbol": "<CUR>",
        "replace_with_punct": "",
    }
    return clean(text, **switches, **placeholders, lang=lang)
def get_gai_completion(prompt, model="gpt-3.5-turbo", temperature=0):
    """Send *prompt* as a single user message and return the reply content.

    Args:
        prompt: Text placed in the ``content`` field of the one user message.
        model: Model name forwarded to ``openai.ChatCompletion.create``.
        temperature: Sampling temperature forwarded to the API; controls how
            random the model's output is.

    Returns:
        The ``content`` entry of the first choice's message in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[user_message],
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]