diff --git a/src/cmd/vocab.rs b/src/cmd/vocab.rs
index 52d5e06..b5f4ade 100644
--- a/src/cmd/vocab.rs
+++ b/src/cmd/vocab.rs
@@ -94,7 +94,7 @@ This command can compute 5 kinds of differents vocabulary statistics:
     5. token-cooccurrence-level statistics (using the \"cooc\" subcommand):
         - token1: the first token
         - token2: the second token
-        - count: total number of co-occurrences
+        - count: number of co-occurrences
         - chi2: chi2 score (approx. without the --complete flag)
         - G2: G2 score (approx. without the --complete flag)
         - pmi: pointwise mutual information
         - npmi: normalized pointwise mutual information
@@ -104,10 +104,16 @@ This command can compute 5 kinds of differents vocabulary statistics:
         - token1: the first token
         - token2: the second token
-        - count: total number of co-occurrences
+        - count: number of co-occurrences
         - sd_I: distributional score based on PMI
         - sd_G2: distributional score based on G2
 
+       or, using the --specificity flag:
+
+        - token: the token
+        - count: total number of co-occurrences
+        - lgl: the specificity score (ratio of statistically relevant co-occurrences)
+
 Usage:
     xan vocab corpus [options] [<input>]
     xan vocab token [options] [<input>]
@@ -147,6 +153,7 @@ vocab cooc options:
                               to get something similar to what word2vec would consider.
     -F, --forward             Whether to only consider a forward window when
                               traversing token contexts.
     --distrib                 Compute directed distributional similarity metrics instead.
+    --specificity             Compute the lgl specificity score per token instead.
     --min-count               Minimum number of co-occurrence count to be included in the result. [default: 1]
     --chi2-significance       Filter doc,token pairs by only keeping significant ones wrt their
@@ -190,6 +197,7 @@ struct Args {
     flag_window: Option<usize>,
     flag_forward: bool,
     flag_distrib: bool,
+    flag_specificity: bool,
     flag_min_count: usize,
     flag_output: Option<String>,
     flag_no_headers: bool,
@@ -216,7 +224,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
         ));
     }
-    if args.flag_distrib && args.flag_forward {
+    if (args.flag_distrib || args.flag_specificity) && args.flag_forward {
         return Err(CliError::Other(
-            "-D, --distrib does not make sense with -F, --forward".to_string(),
+            "-D, --distrib and --specificity do not make sense with -F, --forward".to_string(),
         ));
     }
@@ -253,7 +261,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
 
     let cooccurrence_mode = if args.flag_forward {
         CooccurrenceMode::Forward
-    } else if args.flag_distrib {
+    } else if args.flag_distrib || args.flag_specificity {
         CooccurrenceMode::Full
     } else {
         CooccurrenceMode::Symmetrical
@@ -397,6 +405,15 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
         wtr.write_record(output_headers)?;
         cooccurrences
             .for_each_distrib_cooc_record(args.flag_min_count, |r| wtr.write_byte_record(r))?;
+    } else if args.flag_specificity {
+        let output_headers: [&[u8]; 3] = [b"token", b"count", b"lgl"];
+
+        wtr.write_record(output_headers)?;
+        cooccurrences.for_each_specificity_record(
+            args.flag_min_count,
+            g2_significance,
+            |r| wtr.write_byte_record(r),
+        )?;
     } else {
         let output_headers: [&[u8]; 7] = [
             b"token1", b"token2", b"count", b"chi2", b"G2", b"pmi", b"npmi",
@@ -1355,4 +1372,50 @@ impl Cooccurrences {
 
         Ok(())
     }
+
+    fn for_each_specificity_record<F, E>(
+        self,
+        min_count: usize,
+        g2_significance: Option<f64>,
+        mut callback: F,
+    ) -> Result<(), E>
+    where
+        F: FnMut(&csv::ByteRecord) -> Result<(), E>,
+    {
+        let mut csv_record = csv::ByteRecord::new();
+        let n = self.cooccurrences_count;
+
+        let g2_significance = g2_significance.unwrap_or(3.84);
+
+        for source_entry in self.token_entries.iter() {
+            if source_entry.gcf < min_count {
+                continue;
+            }
+
+            csv_record.clear();
+
+            let mut statistically_significant: usize = 0;
+
+            for (target_id, source_target_count) in source_entry.cooc.iter() {
+                let target_entry = &self.token_entries[*target_id];
+
+                let g2 = compute_g2(source_entry.gcf, target_entry.gcf, *source_target_count, n);
+
+                // 5% test by default (3.84 is the chi2 critical value at p = 0.05, 1 df)
+                if g2 >= g2_significance {
+                    statistically_significant += 1;
+                }
+            }
+
+            let lgl = statistically_significant as f64 / n as f64;
+
+            csv_record.push_field(&source_entry.token);
+            csv_record.push_field(source_entry.gcf.to_string().as_bytes());
+            csv_record.push_field(lgl.to_string().as_bytes());
+
+            callback(&csv_record)?;
+        }
+
+        Ok(())
+    }
 }
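
A note on the statistic behind the new flag: the 3.84 default picked by unwrap_or in for_each_specificity_record is the chi-squared critical value at p = 0.05 with one degree of freedom, which is what makes the threshold a 5% significance test; a directed (source, target) pair counts toward a token's lgl score whenever its G2 (log-likelihood ratio) value clears that threshold. The sketch below only illustrates such a test on a 2x2 contingency table; the function name g2_2x2 and the example figures are assumptions made here for illustration, not the compute_g2 helper the patch actually calls, whose implementation lies outside this diff.

    // Illustrative sketch only: a standalone 2x2 log-likelihood ratio (G2) test.
    // The argument order mirrors the compute_g2 call in the patch (count of the
    // source token, count of the target token, their joint co-occurrence count,
    // overall co-occurrence total), but this is not the crate's actual helper.
    fn g2_2x2(count1: usize, count2: usize, cooc: usize, total: usize) -> f64 {
        // Observed 2x2 contingency table.
        let a = cooc as f64; // token1 together with token2
        let b = (count1 - cooc) as f64; // token1 without token2
        let c = (count2 - cooc) as f64; // token2 without token1
        let d = (total - count1 - count2 + cooc) as f64; // neither token
        let n = total as f64;

        // G2 = 2 * sum(observed * ln(observed / expected)), with 0 * ln(0) taken as 0.
        let term = |o: f64, e: f64| if o > 0.0 { o * (o / e).ln() } else { 0.0 };

        2.0 * (term(a, (a + b) * (a + c) / n)
            + term(b, (a + b) * (b + d) / n)
            + term(c, (c + d) * (a + c) / n)
            + term(d, (c + d) * (b + d) / n))
    }

    fn main() {
        // Hypothetical figures: two tokens seen 100 and 120 times, co-occurring
        // 30 times, in a corpus totalling 10,000 co-occurrences.
        let g2 = g2_2x2(100, 120, 30, 10_000);

        // 3.84 is the 5% chi-squared critical value (1 degree of freedom), the
        // same default threshold the patch falls back to.
        println!("G2 = {:.2}, significant at 5%: {}", g2, g2 >= 3.84);
    }

Like --distrib, the --specificity flag switches traversal to CooccurrenceMode::Full so that every directed (source, target) pair is visited, and the patch rejects combining it with -F, --forward just as it already did for --distrib.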