Skip to content

Commit

Permalink
Add OFV market resolver
Browse files Browse the repository at this point in the history
  • Loading branch information
kongzii committed May 7, 2024
1 parent 82036d0 commit 2cff00a
Show file tree
Hide file tree
Showing 5 changed files with 2,492 additions and 800 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,6 @@ backup_mech/
/packages/valory/skills/termination_abci/
/pip
/tool_test.py
.venv
.venv
log
.benchmark-cache
133 changes: 133 additions & 0 deletions packages/kongzii/customs/ofv_market_resolver/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import typer
import pandas as pd
from packages.kongzii.customs.ofv_market_resolver.ofv_market_resolver import (
run as ofv_run,
)
from packages.napthaai.customs.resolve_market_reasoning.resolve_market_reasoning import (
Results,
run as original_run,
)
from pydantic import SecretStr, ValidationError
from joblib import Memory

# File cache to not re-run the same questions.
# Results of functions wrapped with `MEMORY.cache` are stored on disk under
# `.benchmark-cache`, so re-running the benchmark reuses earlier answers.
MEMORY = Memory(".benchmark-cache", verbose=0)
# Typer application; the `@APP.command()` functions in this file register with it.
APP = typer.Typer()

# Disk-cached variant of the OFV resolver's `run` function.
ofv_run_cached = MEMORY.cache(ofv_run)


@MEMORY.cache
def run_original_resolver_cached(
    question: str,
    openai_api_key: SecretStr,
    google_api_key: SecretStr,
    google_engine_id: SecretStr,
) -> bool | None:
    """
    Resolve `question` with the original `resolve-market-reasoning-gpt-4` tool.

    The first element of the tool's response is parsed as a `Results` JSON
    document and its `has_occurred` field is returned. A `ValueError` raised
    anywhere along the way (tool call or parsing) yields `None` instead.
    The call is wrapped by `MEMORY.cache`.
    """
    try:
        credentials = {
            "openai": openai_api_key.get_secret_value(),
            "google_api_key": google_api_key.get_secret_value(),
            "google_engine_id": google_engine_id.get_secret_value(),
        }
        response = original_run(
            api_keys=credentials,
            tool="resolve-market-reasoning-gpt-4",
            prompt=question,
        )
        # Only the first element of the response carries the JSON dump.
        parsed = Results.model_validate_json(response[0])
        return parsed.has_occurred
    except ValueError:
        return None


@APP.command()
def full(
    data_path: str,
    openai_api_key: str,
    serper_api_key: str,
    google_api_key: str,
    google_engine_id: str,
) -> None:
    """
    Will run the prediction market resolver on all provided data and compare the results.
    Expects a tsv file with columns:
    - question
    - resolution (YES/NO, as currently resolved on Omen)
    - my_resolution (YES/NO, as resolved manually by you, used as ground truth)
    Writes `markets_resolved.tsv` with all predictions and
    `markets_resolved_incorretly_by_ofv.tsv` with the rows OFV got wrong,
    then prints the accuracy of each resolver.
    Example command:
    ```
    python packages/kongzii/customs/ofv_market_resolver/benchmark.py full markets.tsv {openai api key} {serper api key} {google api key} {google engine id}
    ```
    """
    df = pd.read_csv(data_path, sep="\t")
    if df.empty:
        # The accuracies below divide by the number of rows; fail early with a
        # clear message instead of a ZeroDivisionError at the very end.
        raise ValueError(f"No markets found in `{data_path}`, nothing to benchmark.")

    def to_label(resolution: bool | None) -> str:
        # Normalise boolean to YES/NO/None.
        if resolution is None:
            return "None"
        return "YES" if resolution else "NO"

    def accuracy(column: str) -> float:
        # Share of rows where `column` agrees with the manual ground truth.
        return sum(df[column] == df["my_resolution"]) / len(df)

    # Wrap the keys once instead of once per row.
    openai_secret = SecretStr(openai_api_key)
    serper_secret = SecretStr(serper_api_key)
    google_secret = SecretStr(google_api_key)
    google_engine_secret = SecretStr(google_engine_id)

    # Run the resolution on all the data.
    df["ofv_resolution"] = df["question"].apply(
        lambda q: to_label(
            ofv_run_cached(
                q,
                openai_api_key=openai_secret,
                serper_api_key=serper_secret,
            )
        )
    )
    df["new_original_resolution"] = df["question"].apply(
        lambda q: to_label(
            run_original_resolver_cached(
                q,
                openai_api_key=openai_secret,
                google_api_key=google_secret,
                google_engine_id=google_engine_secret,
            )
        )
    )

    # Save all the predictions and separately those that are incorrect.
    # The misspelled file name is kept as-is so existing tooling still finds it.
    df.to_csv("markets_resolved.tsv", sep="\t", index=False)
    df[df["ofv_resolution"] != df["my_resolution"]].to_csv(
        "markets_resolved_incorretly_by_ofv.tsv", sep="\t", index=False
    )

    # Calculate the accuracy.
    accuracy_current = accuracy("resolution")
    accuracy_new_original = accuracy("new_original_resolution")
    accuracy_ofv = accuracy("ofv_resolution")
    print(
        f"""
Current accuracy: {accuracy_current*100:.2f}%
Original's new run accuracy: {accuracy_new_original * 100:.2f}%
OFV's accuracy: {accuracy_ofv*100:.2f}%
"""
    )


@APP.command()
def single(
    question: str,
    openai_api_key: str,
    serper_api_key: str,
) -> None:
    """
    Will run the prediction market resolver on a single question; the resolver prints its own progress and result.
    Example command:
    ```
    python packages/kongzii/customs/ofv_market_resolver/benchmark.py single "Will McDonald's successfully buy back all its Israeli restaurants by 12 April 2024?" {openai api key} {serper api key}
    ```
    """
    # Wrap the raw CLI strings before handing them to the resolver.
    openai_secret = SecretStr(openai_api_key)
    serper_secret = SecretStr(serper_api_key)
    # The return value is intentionally discarded here.
    ofv_run(question, openai_api_key=openai_secret, serper_api_key=serper_secret)


# Dispatch to the Typer commands (`full`, `single`) when run as a script.
if __name__ == "__main__":
    APP()
178 changes: 178 additions & 0 deletions packages/kongzii/customs/ofv_market_resolver/ofv_market_resolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
from factcheck import FactCheck
from factcheck.utils.multimodal import modal_normalization
from langchain_openai import ChatOpenAI
from typing import Annotated
from pydantic import SecretStr, BaseModel, BeforeValidator

# Default value for the `model` parameter of the helpers in this module.
DEFAULT_OPENAI_MODEL = "gpt-4-0125-preview"

# Factuality verdict type. Before pydantic validates the value as `bool | None`,
# the strings "Nothing to check." and "non-factual" are replaced by None;
# any other input is passed through to validation unchanged.
Factuality = Annotated[
    bool | None,
    BeforeValidator(lambda v: None if v in ("Nothing to check.", "non-factual") else v),
]


class FactCheckClaimDetails(BaseModel):
    """Schema of one entry in `FactCheckResult.claims_details`."""

    # NOTE(review): field meanings below are inferred from their names only;
    # confirm against the fact-checking library's output format.
    claim: str  # presumably the text of the individual claim
    factuality: Factuality  # bool or None, normalised by the `Factuality` validator
    correction: str | None  # optional; may be None
    reference_url: str


class FactCheckResult(BaseModel):
    """Validated form of the value returned by `FactCheck.check_response` (see `factcheck`)."""

    factuality: Factuality  # overall verdict: bool or None
    claims_details: list[FactCheckClaimDetails] | None  # per-claim entries, may be None


def factcheck(
    statement: str,
    model: str = DEFAULT_OPENAI_MODEL,
    openai_api_key: SecretStr | None = None,
    serper_api_key: SecretStr | None = None,
) -> FactCheckResult:
    """
    Fact-check `statement` with Open Fact Verifier using the "serper" retriever.

    Args:
        statement: Sentence to verify.
        model: Name of the model passed to `FactCheck` as `default_model`.
        openai_api_key: OpenAI key; when None it is not put into the config.
        serper_api_key: Serper key; when None it is not put into the config.

    Returns:
        The checker's response validated as a `FactCheckResult`.
    """
    # The signature allows None for both keys, but the previous code called
    # `.get_secret_value()` unconditionally and crashed with AttributeError.
    # Forward only the keys that were actually provided.
    # NOTE(review): the library is expected to fall back to the environment
    # variables of the same name for missing keys — confirm.
    provided_keys = {
        "OPENAI_API_KEY": openai_api_key,
        "SERPER_API_KEY": serper_api_key,
    }
    api_config = {
        name: secret.get_secret_value()
        for name, secret in provided_keys.items()
        if secret is not None
    }
    # Named `checker` so the local does not shadow this function.
    checker = FactCheck(
        default_model=model,
        api_config=api_config,
        retriever="serper",
        num_seed_retries=5,
    )
    content = modal_normalization("string", statement)
    raw_result = checker.check_response(content)

    return FactCheckResult.model_validate(raw_result)


def rewrite_as_sentence(
    question: str,
    model: str = DEFAULT_OPENAI_MODEL,
    openai_api_key: SecretStr | None = None,
) -> str:
    """
    Rewrites the question into a sentence, example:
    `Will former Trump Organization CFO Allen Weisselberg be sentenced to jail by 15 April 2024?`
    ->
    `Former Trump Organization CFO Allen Weisselberg was sentenced to jail by 15 April 2024.`

    Args:
        question: Market question to rewrite.
        model: Chat model name passed to `ChatOpenAI`.
        openai_api_key: OpenAI key; when None no key is passed and `ChatOpenAI`
            resolves it on its own.

    Returns:
        The model's completion with surrounding whitespace removed.
    """
    # The key is optional in the signature, so only unwrap it when present;
    # unconditionally calling `.get_secret_value()` crashed on None.
    llm_kwargs = {}
    if openai_api_key is not None:
        llm_kwargs["api_key"] = openai_api_key.get_secret_value()
    llm = ChatOpenAI(model=model, temperature=0.0, **llm_kwargs)

    prompt = f"""
Rewrite the question into a simple annoucment sentence stating a fact or prediction like it is already known.
Make future tense into past tense.
For future questions that ask if something will happen "by" some date, rewrite it to "before" that date or any time sooner.
For future questions that ask if something will happen "on" some date, rewrite it to "on" that date.
If the question is both "on" and "by" some date, rewrite it as "before or any time sooner than" that date.
If the question is about exact date, keep it exact.
If the question is about a date range, keep it a range.
Always keep the same meaning.
Never negate the sentence into opposite meaning of the question.
Question: {question}
Sentence:
"""
    completion = str(llm.invoke(prompt, max_tokens=512).content)

    # The prompt ends with a newline after "Sentence:", so trim stray whitespace
    # before the sentence is printed and fact-checked.
    return completion.strip()


# TODO: This could be imported from prediction-market-agent-tooling, but given the conflict in the langchain versions,
# it would require changes in other mechs of this repository.
def is_predictable_binary(
    question: str,
    model: str = DEFAULT_OPENAI_MODEL,
    openai_api_key: SecretStr | None = None,
) -> bool:
    """
    Evaluate if the question is actually answerable.

    Args:
        question: Market question to evaluate.
        model: Chat model name passed to `ChatOpenAI`.
        openai_api_key: OpenAI key; when None no key is passed and `ChatOpenAI`
            resolves it on its own.

    Returns:
        True if the model's final decision contains "yes", False if it contains "no".

    Raises:
        ValueError: If the completion has no `decision` section, or that section
            contains neither "yes" nor "no".
    """
    # The key is optional in the signature, so only unwrap it when present;
    # unconditionally calling `.get_secret_value()` crashed on None.
    llm_kwargs = {}
    if openai_api_key is not None:
        llm_kwargs["api_key"] = openai_api_key.get_secret_value()
    llm = ChatOpenAI(model=model, temperature=0.0, **llm_kwargs)

    prompt = f"""Main signs about a fully qualified question (sometimes referred to as a "market"):
- The market's question needs to be specific, without use of pronouns.
- The market's question needs to have a clear future event.
- The market's question needs to have a clear time frame.
- The event in the market's question doesn't have to be ultra-specific, it will be decided by a crowd later on.
- If the market's question contains date, but without an year, it's okay.
- If the market's question contains year, but without an exact date, it's okay.
- The market's question can not be about itself or refer to itself.
- The answer is probably Google-able, after the event happened.
- The potential asnwer can be only "Yes" or "No".
Follow a chain of thought to evaluate if the question is fully qualified:
First, write the parts of the following question:
"{question}"
Then, write down what is the future event of the question, what it refers to and when that event will happen if the question contains it.
Then, explain why do you think it is or isn't fully qualified.
Finally, write your final decision, write `decision: ` followed by either "yes it is fully qualified" or "no it isn't fully qualified" about the question. Don't write anything else after that. You must include "yes" or "no".
"""
    completion = str(llm.invoke(prompt, max_tokens=512).content)

    try:
        # Only the text after the last "decision" marker is considered.
        decision = completion.lower().rsplit("decision", 1)[1]
    except IndexError as e:
        raise ValueError(
            f"Invalid completion in is_predictable for `{question}`: {completion}"
        ) from e

    # "yes" is checked first, matching the order of the expected answers.
    if "yes" in decision:
        return True
    if "no" in decision:
        return False
    raise ValueError(
        f"Invalid completion in is_predictable for `{question}`: {completion}"
    )


def run(
    market_question: str,
    openai_api_key: SecretStr | None = None,
    serper_api_key: SecretStr | None = None,
) -> bool | None:
    """
    Resolve a prediction market question with the Open Fact Verifier pipeline.

    Steps: check that the question is answerable, rewrite it as a statement,
    then fact-check that statement. Progress is reported via `print`.

    Returns:
    - None if can't decide
    - True if the answer for the question is "Yes"
    - False if the answer for the question is "No"
    """
    # Guard: questions that are not answerable are not worth fact-checking.
    if not is_predictable_binary(market_question, openai_api_key=openai_api_key):
        print(
            f"Question `{market_question}` is not answerable, skipping fact checking."
        )
        return None

    # Turn the future-tense question into a past-tense statement.
    sentence = rewrite_as_sentence(market_question, openai_api_key=openai_api_key)
    print(f"Question `{market_question}` rewritten into `{sentence}`.")

    # Verify the statement and report the verdict with its supporting details.
    verdict = factcheck(
        sentence,
        openai_api_key=openai_api_key,
        serper_api_key=serper_api_key,
    )
    print(
        f"Fact check result for `{sentence}` is `{verdict.factuality}`, because {verdict.claims_details}."
    )

    return verdict.factuality
Loading

0 comments on commit 2cff00a

Please sign in to comment.