Skip to content

Commit

Permalink
jaccard test also needed comparison update... delete commented out C+…
Browse files Browse the repository at this point in the history
…+ code
  • Loading branch information
ChuckHastings committed Nov 13, 2024
1 parent f00d4cf commit eed6afd
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 5 deletions.
1 change: 0 additions & 1 deletion cpp/src/link_prediction/similarity_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,6 @@ all_pairs_similarity(raft::handle_t const& handle,
// computing/updating topk with each batch

// FIXME: Experiment with this and adjust as necessary
// size_t const MAX_PAIRS_PER_BATCH{100};
size_t const MAX_PAIRS_PER_BATCH{
static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * (1 << 15)};

Expand Down
86 changes: 83 additions & 3 deletions python/cugraph/cugraph/tests/link_prediction/test_jaccard.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import pytest
import networkx as nx
import pandas as pd

import cudf
import cugraph
Expand Down Expand Up @@ -153,6 +154,54 @@ def networkx_call(M, benchmark_callable=None):
return src, dst, coeff


# FIXME: This compare is shared across several tests... it should be
# a general utility
def compare(src1, dst1, val1, src2, dst2, val2):
#
# We will do comparison computations by using dataframe
# merge functions (essentially doing fast joins). We
# start by making two data frames
#
df1 = cudf.DataFrame()
df1["src1"] = src1
df1["dst1"] = dst1
if val1 is not None:
df1["val1"] = val1

df2 = cudf.DataFrame()
df2["src2"] = src2
df2["dst2"] = dst2
if val2 is not None:
df2["val2"] = val2

#
# Check to see if all pairs in the original data frame
# still exist in the new data frame. If we join (merge)
# the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i])
# then we should get exactly the same number of entries in
# the data frame if we did not lose any data.
#
join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"])

if len(df1) != len(join):
join2 = df1.merge(
df2, how="left", left_on=["src1", "dst1"], right_on=["src2", "dst2"]
)
pd.set_option("display.max_rows", 500)
print("df1 = \n", df1.sort_values(["src1", "dst1"]))
print("df2 = \n", df2.sort_values(["src2", "dst2"]))
print(
"join2 = \n",
join2.sort_values(["src1", "dst1"])
.to_pandas()
.query("src2.isnull()", engine="python"),
)

assert len(df1) == len(join)

assert_series_equal(join["val1"], join["val2"], check_names=False)


# =============================================================================
# Pytest Fixtures
# =============================================================================
Expand Down Expand Up @@ -415,7 +464,7 @@ def test_all_pairs_jaccard_with_topk():
jaccard_results = (
jaccard_results[jaccard_results["first"] != jaccard_results["second"]]
.sort_values(["jaccard_coeff", "first", "second"], ascending=False)
.reset_index(drop=True)[:topk]
.reset_index(drop=True)
)

# Call all-pairs Jaccard
Expand All @@ -425,6 +474,37 @@ def test_all_pairs_jaccard_with_topk():
.reset_index(drop=True)
)

assert_frame_equal(
jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True
# 1. All pair similarity might return different top pairs k pairs
# which are still valid hence, ensure the pairs returned by all-pairs
# exists, and that any results better than the k-th result are included
# in the result

# FIXME: This problem could exist in overlap, cosine and sorensen,
# consider replicating this code or making a share comparison
# function
worst_coeff = all_pairs_jaccard_results["jaccard_coeff"].min()
better_than_k = jaccard_results[jaccard_results["jaccard_coeff"] > worst_coeff]

compare(
all_pairs_jaccard_results["first"],
all_pairs_jaccard_results["second"],
all_pairs_jaccard_results["jaccard_coeff"],
jaccard_results["first"],
jaccard_results["second"],
jaccard_results["jaccard_coeff"],
)

compare(
better_than_k["first"],
better_than_k["second"],
better_than_k["jaccard_coeff"],
all_pairs_jaccard_results["first"],
all_pairs_jaccard_results["second"],
all_pairs_jaccard_results["jaccard_coeff"],
)

# 2. Ensure the coefficient scores are still the highest
assert_series_equal(
all_pairs_jaccard_results["jaccard_coeff"],
jaccard_results["jaccard_coeff"][:topk],
)
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,7 @@ def test_all_pairs_sorensen_with_topk():
# exists, and that any results better than the k-th result are included
# in the result

# FIXME: This problem could exist in overlap, cosine and sorensen,
# FIXME: This problem could exist in overlap, cosine and jaccard,
# consider replicating this code or making a share comparison
# function
worst_coeff = all_pairs_sorensen_results["sorensen_coeff"].min()
Expand Down

0 comments on commit eed6afd

Please sign in to comment.