-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotate.py
138 lines (108 loc) · 7.1 KB
/
annotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import random
import json
import os
from typing import Dict, Any
from openai import OpenAI
from openai.types import CompletionChoice
MODEL = "gpt-3.5-turbo-1106"
PREAMBLE = """A good summary is a shorter piece of text that has the essence of the original. It tries to accomplish the same purpose and conveys the key information from the original post. Below we define four evaluation axes for summary quality: coherence, accuracy, coverage, and overall quality.
Coherence: This axis answers the question “how coherent is the summary on its own?” A summary is coherent if it's easy to understand when read on its own and free of English errors. A summary is not coherent if it's difficult to understand what the summary is trying to say. Generally, it's more important that the summary is understandable than it being free of grammar errors.
Accuracy: This axis answers the question “does the factual information in the summary accurately match the post?” A summary is accurate if it doesn't say things that aren't in the article, it doesn't mix up people, and generally is not misleading.
Coverage: This axis answers the question “how well does the summary cover the important information in the post?” A summary has good coverage if it mentions the main information from the post that's important to understand the situation described in the post. A summary has poor coverage if someone reading only the summary would be missing several important pieces of information about the situation in the post. A summary with good coverage should also match the purpose of the original post (e.g. to ask for advice).
Overall quality: This axis answers the question “how good is the summary overall at representing the post?” This can encompass all of the above axes of quality, as well as others you feel are important. If it's hard to find ways to make the summary better, the overall quality is good. If there are lots of different ways the summary can be made better, the overall quality is bad.
You are an expert summary rater. Given a piece of text and two of its possible summaries, output 1 or 2 to indicate which summary best adheres to coherence, accuracy, coverage, and overall quality as defined above."""
FEW_SHOT_PREAMBLE = """Text: We were best friends over 4 years and dated over 3 years and just broke up before she moved for grad school. But things ended in a weird way, and it's only been 5 days since I last texted her. Her birthday is the 28th and was wondering if I should wish my ex happy birthday and what everyone thinks? Break no contact? It's a complicated story but the main reason I got myself here is from being too needy and not giving her enough space. Shes an introvert and I really smothered her, they need to feel they can get away when they need to and not feel bad about it and I was like a ball and chain for her emotionally. I don't want her to think I'll keep being that guy.
Summary 1: Broke up with best friend, should I wish her a happy birthday... And what do you think of no contact?
Summary 2: should I wish my ex happy birthday, I broke no contact, I'm trying to be more patient, I'm too needy, and I don't want her to think I'll keep being that guy.
Thoughts on Summary 1:
Coherence - 7. Rationale: The summary is generally understandable, though it could be written with better grammar.
Accuracy - 9. Rationale: The summary doesn't say things that aren't in the original text, and isn't misleading.
Coverage - 6. Rationale: The summary covers most of the important information in the post and conveys the gist of the original text. However, it places more emphasis on "no contact" and could have mentioned the smothering/neediness to be more complete.
Overall Quality - 7. Rationale: The summary represents the post fairly well with only minor areas where it could be improved.
Thoughts on Summary 2:
Coherence - 3. Rationale: The summary is long-winded and has several grammatical errors.
Accuracy - 4. Rationale: The summary mentions that the author broke no contact, but this is incorrect. Otherwise, it is accurate.
Coverage - 8. Rationale: The summary covers the key points in the original text.
Overall Quality - 4. Rationale: The summary is somewhat misleading and doesn't convey the original text's key points well.
Output:
```json
{
"preference": 1,
"reason": "Summary 1 is more coherent and accurate. Summary 2 is too long-winded and has several grammatical errors."
}
```
"""
def construct_query_message(post: str, chosen: str, rejected: str):
"""Construct the query message for the GPT-3 API.
Args:
post: str, the post
chosen: str, the chosen summary
rejected: str, the rejected summary
Returns:
message: str, the query message
"""
summary_list = [chosen, rejected]
idx_list = [0, 1]
random.shuffle(idx_list)
if idx_list[0] == 0:
ans = 1
else:
ans = 2
msg = PREAMBLE + '\n\n' + FEW_SHOT_PREAMBLE + '\n\n' + f"""{post}\n\Summary 1: {summary_list[idx_list[0]]}\n\Summary 2: {summary_list[idx_list[1]]}\n\nPlease just strictly output a JSON string, which has following keys:\n\n- preference: int, 1 if you prefer Summary 1, 2 if you prefer Summary 2\n- reason: str, the brief (less than 50 words) reason why you give the above preference\n"""
return msg, ans
def get_completions(message: str, api_key: str, n: int = 1):
"""Get the logprob of the message.
Args:
message: str, the message to be evaluated
api_key: str, the API key
n: int, the number of completions to generate
Returns:
logprob: float, the logprob of the message
"""
client = OpenAI(api_key=api_key)
completion = client.chat.completions.create(
model=MODEL,
messages=[
{"role": "system", "content": PREAMBLE},
{"role": "user", "content": message},
],
temperature=0.0,
n=n,
)
return completion.choices
def annotate_tldr_post(post: Dict[str, str], api_key: str) -> int:
"""Annotate the post.
Args:
post: dict, the post dictionary of the following format
{
'post': a string of the post
'chosen': a string of the chonse summary
'rejected': a string of the rejected summary
}
api_key: str, the API key
Returns:
accuracy: int, 1 if the chosen summary is better than the rejected, 0
if the rejected summary is better than the chosen, -1 if the annotator
output is invalid
query_msg: str, the query message
result: str, the result from the annotator
"""
post_text = post['post']
chosen_text = post['chosen']
rejected_text = post['rejected']
query_msg, ans = construct_query_message(
post_text, chosen_text, rejected_text
)
completions = get_completions(query_msg, api_key, n=1)
result = completions[0].message.content
try:
result = json.loads(result)
choice = int(result['preference'])
except ValueError:
return -1, query_msg, result
if not choice in [1, 2]:
return -1, query_msg, result
elif choice == ans:
return 1, query_msg, result
else:
return 0, query_msg, result