Showing 5 changed files with 2,492 additions and 800 deletions.
@@ -54,4 +54,6 @@ backup_mech/
 /packages/valory/skills/termination_abci/
 /pip
 /tool_test.py
-.venv
+.venv
+log
+.benchmark-cache
133 changes: 133 additions & 0 deletions
packages/kongzii/customs/ofv_market_resolver/benchmark.py
@@ -0,0 +1,133 @@
import typer
import pandas as pd
from packages.kongzii.customs.ofv_market_resolver.ofv_market_resolver import (
    run as ofv_run,
)
from packages.napthaai.customs.resolve_market_reasoning.resolve_market_reasoning import (
    Results,
    run as original_run,
)
from pydantic import SecretStr
from joblib import Memory

# File cache, so the same questions are not re-run.
MEMORY = Memory(".benchmark-cache", verbose=0)
APP = typer.Typer()

ofv_run_cached = MEMORY.cache(ofv_run)


@MEMORY.cache
def run_original_resolver_cached(
    question: str,
    openai_api_key: SecretStr,
    google_api_key: SecretStr,
    google_engine_id: SecretStr,
) -> bool | None:
    try:
        dump = original_run(
            api_keys={
                "openai": openai_api_key.get_secret_value(),
                "google_api_key": google_api_key.get_secret_value(),
                "google_engine_id": google_engine_id.get_secret_value(),
            },
            tool="resolve-market-reasoning-gpt-4",
            prompt=question,
        )[0]
        return Results.model_validate_json(dump).has_occurred
    except ValueError:
        # Also covers pydantic's ValidationError, which subclasses ValueError.
        return None


@APP.command()
def full(
    data_path: str,
    openai_api_key: str,
    serper_api_key: str,
    google_api_key: str,
    google_engine_id: str,
) -> None:
    """
    Run the prediction market resolver on all provided data and compare the results.
    Expects a tsv file with columns:
    - question
    - resolution (YES/NO, as currently resolved on Omen)
    - my_resolution (YES/NO, as resolved manually by you, used as ground truth)
    Example command:
    ```
    python packages/kongzii/customs/ofv_market_resolver/benchmark.py full markets.tsv {openai api key} {serper api key} {google api key} {google engine id}
    ```
    """
    df = pd.read_csv(data_path, sep="\t")

    # Run the resolution on all the data.
    df["ofv_resolution"] = df["question"].apply(
        lambda q: ofv_run_cached(
            q,
            openai_api_key=SecretStr(openai_api_key),
            serper_api_key=SecretStr(serper_api_key),
        )
    )
    df["new_original_resolution"] = df["question"].apply(
        lambda q: run_original_resolver_cached(
            q,
            openai_api_key=SecretStr(openai_api_key),
            google_api_key=SecretStr(google_api_key),
            google_engine_id=SecretStr(google_engine_id),
        )
    )
    # Normalise booleans to YES/NO/None.
    df["ofv_resolution"] = df["ofv_resolution"].apply(
        lambda r: "None" if r is None else "YES" if r else "NO"
    )
    df["new_original_resolution"] = df["new_original_resolution"].apply(
        lambda r: "None" if r is None else "YES" if r else "NO"
    )
    # Save all the predictions, and separately those that are incorrect.
    df.to_csv("markets_resolved.tsv", sep="\t", index=False)
    df[df["ofv_resolution"] != df["my_resolution"]].to_csv(
        "markets_resolved_incorrectly_by_ofv.tsv", sep="\t", index=False
    )

    # Calculate the accuracy.
    accuracy_current = sum(df["resolution"] == df["my_resolution"]) / len(df)
    accuracy_new_original = sum(
        df["new_original_resolution"] == df["my_resolution"]
    ) / len(df)
    accuracy_ofv = sum(df["ofv_resolution"] == df["my_resolution"]) / len(df)
    print(
        f"""
Current accuracy: {accuracy_current * 100:.2f}%
Original's new run accuracy: {accuracy_new_original * 100:.2f}%
OFV's accuracy: {accuracy_ofv * 100:.2f}%
"""
    )


@APP.command()
def single(
    question: str,
    openai_api_key: str,
    serper_api_key: str,
) -> None:
    """
    Run the prediction market resolver on a single question and print the result.
    Example command:
    ```
    python packages/kongzii/customs/ofv_market_resolver/benchmark.py single "Will McDonald's successfully buy back all its Israeli restaurants by 12 April 2024?" {openai api key} {serper api key}
    ```
    """
    ofv_run(
        question,
        openai_api_key=SecretStr(openai_api_key),
        serper_api_key=SecretStr(serper_api_key),
    )


if __name__ == "__main__":
    APP()
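For reference, a minimal sketch of an input file with the three columns the `full` command expects; the market question and resolutions below are illustrative placeholders, not real Omen data:

```python
# Illustrative only: build a tiny markets.tsv with the columns `full` expects.
import pandas as pd

pd.DataFrame(
    {
        "question": ["Will the 2024 Summer Olympics open in Paris on 26 July 2024?"],
        "resolution": ["YES"],  # as currently resolved on Omen
        "my_resolution": ["YES"],  # manual ground truth
    }
).to_csv("markets.tsv", sep="\t", index=False)
```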
178 changes: 178 additions & 0 deletions
packages/kongzii/customs/ofv_market_resolver/ofv_market_resolver.py
@@ -0,0 +1,178 @@
from factcheck import FactCheck
from factcheck.utils.multimodal import modal_normalization
from langchain_openai import ChatOpenAI
from typing import Annotated
from pydantic import SecretStr, BaseModel, BeforeValidator

DEFAULT_OPENAI_MODEL = "gpt-4-0125-preview"

# The fact checker reports factuality either as a bool or as one of two sentinel
# strings; map the sentinels to None so "undecidable" parses cleanly.
Factuality = Annotated[
    bool | None,
    BeforeValidator(lambda v: None if v in ("Nothing to check.", "non-factual") else v),
]


class FactCheckClaimDetails(BaseModel):
    claim: str
    factuality: Factuality
    correction: str | None
    reference_url: str


class FactCheckResult(BaseModel):
    factuality: Factuality
    claims_details: list[FactCheckClaimDetails] | None


def factcheck(
    statement: str,
    model: str = DEFAULT_OPENAI_MODEL,
    openai_api_key: SecretStr | None = None,
    serper_api_key: SecretStr | None = None,
) -> FactCheckResult:
    api_config = {
        "OPENAI_API_KEY": openai_api_key.get_secret_value(),
        "SERPER_API_KEY": serper_api_key.get_secret_value(),
    }
    fact_checker = FactCheck(
        default_model=model,
        api_config=api_config,
        retriever="serper",
        num_seed_retries=5,
    )
    content = modal_normalization("string", statement)
    res = fact_checker.check_response(content)

    return FactCheckResult.model_validate(res)


def rewrite_as_sentence(
    question: str,
    model: str = DEFAULT_OPENAI_MODEL,
    openai_api_key: SecretStr | None = None,
) -> str:
    """
    Rewrites the question into a sentence, for example:
    `Will former Trump Organization CFO Allen Weisselberg be sentenced to jail by 15 April 2024?`
    ->
    `Former Trump Organization CFO Allen Weisselberg was sentenced to jail by 15 April 2024.`
    """
    llm = ChatOpenAI(
        model=model, temperature=0.0, api_key=openai_api_key.get_secret_value()
    )

    prompt = f"""
Rewrite the question into a simple announcement sentence stating a fact or prediction as if it were already known.
Make future tense into past tense.
For future questions that ask if something will happen "by" some date, rewrite it to "before" that date or any time sooner.
For future questions that ask if something will happen "on" some date, rewrite it to "on" that date.
If the question is both "on" and "by" some date, rewrite it as "before or any time sooner than" that date.
If the question is about an exact date, keep it exact.
If the question is about a date range, keep it a range.
Always keep the same meaning.
Never negate the sentence into the opposite meaning of the question.
Question: {question}
Sentence:
"""
    completion = str(llm.invoke(prompt, max_tokens=512).content)

    return completion


# TODO: This could be imported from prediction-market-agent-tooling, but given the
# conflict in the langchain versions, it would require changes in other mechs of
# this repository.
def is_predictable_binary(
    question: str,
    model: str = DEFAULT_OPENAI_MODEL,
    openai_api_key: SecretStr | None = None,
) -> bool:
    """
    Evaluate if the question is actually answerable.
    """
    llm = ChatOpenAI(
        model=model, temperature=0.0, api_key=openai_api_key.get_secret_value()
    )

    prompt = f"""Main signs of a fully qualified question (sometimes referred to as a "market"):
- The market's question needs to be specific, without use of pronouns.
- The market's question needs to have a clear future event.
- The market's question needs to have a clear time frame.
- The event in the market's question doesn't have to be ultra-specific, it will be decided by a crowd later on.
- If the market's question contains a date, but not a year, it's okay.
- If the market's question contains a year, but not an exact date, it's okay.
- The market's question cannot be about itself or refer to itself.
- The answer is probably Google-able, after the event happened.
- The potential answer can be only "Yes" or "No".
Follow a chain of thought to evaluate if the question is fully qualified:
First, write the parts of the following question:
"{question}"
Then, write down what the future event of the question is, what it refers to, and when that event will happen, if the question contains it.
Then, explain why you think it is or isn't fully qualified.
Finally, write your final decision: write `decision: ` followed by either "yes it is fully qualified" or "no it isn't fully qualified" about the question. Don't write anything else after that. You must include "yes" or "no".
"""
    completion = str(llm.invoke(prompt, max_tokens=512).content)

    try:
        decision = completion.lower().rsplit("decision", 1)[1]
    except IndexError as e:
        raise ValueError(
            f"Invalid completion in is_predictable for `{question}`: {completion}"
        ) from e

    if "yes" in decision:
        is_predictable = True
    elif "no" in decision:
        is_predictable = False
    else:
        raise ValueError(
            f"Invalid completion in is_predictable for `{question}`: {completion}"
        )

    return is_predictable


def run(
    market_question: str,
    openai_api_key: SecretStr | None = None,
    serper_api_key: SecretStr | None = None,
) -> bool | None:
    """
    Run the prediction market resolver based on Open Fact Verifier.
    Returns:
    - None if it can't decide
    - True if the answer to the question is "Yes"
    - False if the answer to the question is "No"
    """
    # Check that the question is reasonable enough to look for an answer.
    is_answerable = is_predictable_binary(
        market_question, openai_api_key=openai_api_key
    )
    if not is_answerable:
        print(
            f"Question `{market_question}` is not answerable, skipping fact checking."
        )
        return None

    # Rewrite the question (which is about the future) into a sentence (which is about the past).
    market_sentence = rewrite_as_sentence(
        market_question, openai_api_key=openai_api_key
    )
    print(f"Question `{market_question}` rewritten into `{market_sentence}`.")
    # Fact-check the sentence.
    factresult = factcheck(
        market_sentence, openai_api_key=openai_api_key, serper_api_key=serper_api_key
    )
    print(
        f"Fact check result for `{market_sentence}` is `{factresult.factuality}`, "
        f"because {factresult.claims_details}."
    )

    return factresult.factuality
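A minimal usage sketch of `run`, assuming the repository root is on `PYTHONPATH`; the question and API keys below are placeholders, not real values:

```python
# Minimal usage sketch; the question and API keys are placeholders.
from pydantic import SecretStr

from packages.kongzii.customs.ofv_market_resolver.ofv_market_resolver import run

answer = run(
    "Will the 2024 Summer Olympics open in Paris on 26 July 2024?",
    openai_api_key=SecretStr("sk-..."),
    serper_api_key=SecretStr("..."),
)
print(answer)  # True ("Yes"), False ("No"), or None if unanswerable/undecidable.
```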