From 78ae3549e01bc98eb33d3d895971c74824caffed Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Mon, 25 Nov 2024 16:53:50 +0100 Subject: [PATCH] Adding significance level for vocab cooc Related to #383 --- src/cmd/vocab.rs | 77 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/src/cmd/vocab.rs b/src/cmd/vocab.rs index 12c9fdb..ee87f5c 100644 --- a/src/cmd/vocab.rs +++ b/src/cmd/vocab.rs @@ -20,16 +20,16 @@ use crate::util; use crate::CliError; use crate::CliResult; -#[derive(Clone, Copy)] -struct Chi2SignificanceLevel(f64); +#[derive(Clone, Copy, Debug)] +struct SignificanceLevel(f64); -impl Chi2SignificanceLevel { +impl SignificanceLevel { fn get(&self) -> f64 { self.0 } } -impl<'de> Deserialize<'de> for Chi2SignificanceLevel { +impl<'de> Deserialize<'de> for SignificanceLevel { fn deserialize>(d: D) -> Result { let raw = String::deserialize(d)?; @@ -131,24 +131,32 @@ vocab options: vocab doc-token options: --tf-weight TF weighting scheme. One of \"count\", \"binary\", \"ratio\", or \"log-normal\". [default: count] - --k1-value \"k1\" Factor for BM25 computation. [default: 1.2] - --b-value \"b\" Factor for BM25 computation. [default: 0.75] + --k1-value \"k1\" Factor for BM25 computation. [default: 1.2] + --b-value \"b\" Factor for BM25 computation. [default: 0.75] --chi2-significance Filter doc,token pairs by only keeping significant ones wrt their chi2 score that must be above the given significance level. Accepted levels include \"0.5\", \"0.1\", \"0.05\", \"0.025\", \"0.01\", \"0.005\" and \"0.001\". vocab cooc options: - -w, --window Size of the co-occurrence window, in number of tokens around the currently - considered token. If not given, co-occurrences will be computed using the bag of - words model where tokens are considered to co-occur with every - other one in the same document. - Set the window to \"1\" to compute bigram collocations. Set a larger window - to get something similar to what word2vec would consider. - -F, --forward Whether to only consider a forward window when traversing token contexts. - --distrib Compute directed distributional similarity metrics instead. - --min-count Minimum number of co-occurrence count to be included in the result. - [default: 1] + -w, --window Size of the co-occurrence window, in number of tokens around the currently + considered token. If not given, co-occurrences will be computed using the bag + of words model where tokens are considered to co-occur with every + other one in the same document. + Set the window to \"1\" to compute bigram collocations. Set a larger window + to get something similar to what word2vec would consider. + -F, --forward Whether to only consider a forward window when traversing token contexts. + --distrib Compute directed distributional similarity metrics instead. + --min-count Minimum number of co-occurrence count to be included in the result. + [default: 1] + --chi2-significance Filter doc,token pairs by only keeping significant ones wrt their + chi2 score that must be above the given significance level. Accepted + levels include \"0.5\", \"0.1\", \"0.05\", \"0.025\", \"0.01\", + \"0.005\" and \"0.001\". + --G2-significance Filter doc,token pairs by only keeping significant ones wrt their + G2 score that must be above the given significance level. Accepted + levels include \"0.5\", \"0.1\", \"0.05\", \"0.025\", \"0.01\", + \"0.005\" and \"0.001\". Common options: -h, --help Display this message @@ -176,7 +184,9 @@ struct Args { flag_tf_weight: TfWeighting, flag_k1_value: f64, flag_b_value: f64, - flag_chi2_significance: Option, + flag_chi2_significance: Option, + #[serde(rename = "flag_G2_significance")] + flag_g2_significance: Option, flag_window: Option, flag_forward: bool, flag_distrib: bool, @@ -212,6 +222,9 @@ pub fn run(argv: &[&str]) -> CliResult<()> { )); } + let chi2_significance = args.flag_chi2_significance.map(|s| s.get()); + let g2_significance = args.flag_g2_significance.map(|s| s.get()); + let rconf = Config::new(&args.arg_input) .delimiter(args.flag_delimiter) .no_headers(args.flag_no_headers); @@ -390,8 +403,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> { ]; wtr.write_record(output_headers)?; - cooccurrences - .for_each_cooc_record(args.flag_min_count, |r| wtr.write_byte_record(r))?; + cooccurrences.for_each_cooc_record( + args.flag_min_count, + chi2_significance, + g2_significance, + |r| wtr.write_byte_record(r), + )?; } return Ok(wtr.flush()?); @@ -452,7 +469,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { args.flag_k1_value, args.flag_b_value, args.flag_tf_weight, - args.flag_chi2_significance.map(|s| s.get()), + chi2_significance, |r| wtr.write_byte_record(r), )?; } else if args.cmd_doc { @@ -1031,7 +1048,13 @@ impl Cooccurrences { target_entry.gcf += 1; } - fn for_each_cooc_record(self, min_count: usize, mut callback: F) -> Result<(), E> + fn for_each_cooc_record( + self, + min_count: usize, + chi2_significance: Option, + g2_significance: Option, + mut callback: F, + ) -> Result<(), E> where F: FnMut(&csv::ByteRecord) -> Result<(), E>, { @@ -1054,6 +1077,18 @@ impl Cooccurrences { // chi2/G2 computations let (chi2, g2) = compute_chi2_and_g2(x, y, xy, n); + if let Some(level) = chi2_significance { + if chi2 < level { + continue; + } + } + + if let Some(level) = g2_significance { + if g2 < level { + continue; + } + } + // PMI-related computations let pmi = compute_pmi(x, y, xy, n); let npmi = compute_npmi(xy, n, pmi);