-
Notifications
You must be signed in to change notification settings - Fork 0
/
summarizer.py
62 lines (50 loc) · 1.75 KB
/
summarizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import nltk
import openai
# The OpenAI client cannot be used without a key, so fail at import time
# when API_KEY is missing or empty.
if not os.environ.get("API_KEY"):
    raise Exception("Setup API_KEY as environment variables!")
openai.api_key = os.environ.get("API_KEY")

# Completion engine name; can be overridden through the ENGINE variable.
ENGINE = os.environ.get("ENGINE", "text-davinci-003")

# Fixed text placed before and after the input when building the prompt.
SUMMARIZE_PREFIX = "Summarize this for an expert:\n\n"
TLDR_POSTFIX = "\n tl;dr:"
def summarize(input_text, max_length=60):
    """Ask the completion engine for a summary of ``input_text``.

    The prompt is SUMMARIZE_PREFIX + ``input_text`` + TLDR_POSTFIX. The text
    of the first returned choice is cleaned with ``post_processing``.

    Args:
        input_text: Text to place inside the summarisation prompt.
        max_length: Value forwarded to the API as ``max_tokens``.

    Returns:
        The post-processed text of the first completion choice.
    """
    request = {
        "engine": ENGINE,
        "prompt": "".join((SUMMARIZE_PREFIX, input_text, TLDR_POSTFIX)),
        "temperature": 0.7,
        "max_tokens": max_length,
        "top_p": 1.0,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
    }
    completion = openai.Completion.create(**request)
    first_choice = completion["choices"][0]
    return post_processing(first_choice["text"])
def _batch_sentences(sentences, batch_size):
    """Greedily pack sentences into batches of at most ``batch_size`` word tokens.

    Sentences within a batch are joined with a single space so that words on
    either side of a sentence boundary do not run together. A sentence that
    on its own exceeds ``batch_size`` is kept whole in a batch of its own.
    Empty batches are never produced.
    """
    batches = []
    current = []
    current_tokens = 0
    for sentence in sentences:
        # Tokenize each sentence exactly once.
        sentence_tokens = len(nltk.word_tokenize(sentence))
        # Flush only when something has been collected; this prevents an
        # empty batch when the very first sentence is already too long.
        if current and current_tokens + sentence_tokens > batch_size:
            batches.append(" ".join(current))
            current = []
            current_tokens = 0
        current.append(sentence)
        current_tokens += sentence_tokens
    if current:
        batches.append(" ".join(current))
    return batches


def generate_summary(input_text, batch_size=512, max_length=140):
    """Summarise ``input_text`` batch by batch and concatenate the results.

    The text is split into sentences with NLTK, the sentences are grouped
    into batches whose NLTK word-token count does not exceed ``batch_size``,
    and every batch is summarised with ``summarize``.

    Args:
        input_text: The text to summarise.
        batch_size: Maximum number of NLTK word tokens per batch. This is a
            word-level count, not the completion model's own token count.
        max_length: Passed to ``summarize`` for every batch.

    Returns:
        The per-batch summaries concatenated in order. An empty string is
        returned, without calling the API, when the input has no sentences.
    """
    sentences = nltk.tokenize.sent_tokenize(input_text)
    batches = _batch_sentences(sentences, batch_size)
    return "".join(str(summarize(batch, max_length)) for batch in batches)
def post_processing(response_text):
    """Trim a trailing incomplete sentence from ``response_text``.

    Everything after the last full stop is dropped. If the text contains no
    full stop it is kept as is. Finally, every occurrence of the literal
    two-character sequence backslash + ``n`` is removed.

    Args:
        response_text: Raw completion text.

    Returns:
        The cleaned text.
    """
    # rfind returns -1 when there is no full stop, which is an expected
    # case and not an error, so nothing is raised or printed for it.
    fullstop_index = response_text.rfind(".")
    if fullstop_index != -1:
        response_text = response_text[: fullstop_index + 1]
    # NOTE(review): this strips an escaped "\n" (backslash followed by n),
    # not real newline characters - confirm that is the intent.
    return response_text.replace("\\n", "")