From 085ee43909580d4f8dcbdd4815b2e4d3e278c617 Mon Sep 17 00:00:00 2001
From: David Helbig <52451401+davidhelbig@users.noreply.github.com>
Date: Tue, 6 Aug 2024 22:49:56 +0200
Subject: [PATCH 1/4] Add TokenBuilder, documentation

---
 src/lib.rs     | 99 ++++++++++++++++++++++++++++++++++++++++++++++++--
 src/parsers.rs | 37 +++++++++++++++++--
 2 files changed, 129 insertions(+), 7 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 240b1fb..52c1184 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -38,7 +38,7 @@ pub use crate::parsers::{parse_file, parse_sentence, parse_token};
 
 pub struct Feature<'a>(pub &'a str, pub &'a str);
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct ParseUposError;
 
 impl fmt::Display for ParseUposError {
@@ -109,7 +109,7 @@ pub enum TokenID {
 
 type Features = HashMap<String, String>;
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Token {
     pub id: TokenID,
     pub form: String,
@@ -123,13 +123,104 @@ pub struct Token {
     pub misc: Option<String>,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+impl Token {
+    pub fn builder(id: TokenID, form: String) -> TokenBuilder {
+        TokenBuilder::new(id, form)
+    }
+}
+
+pub struct TokenBuilder {
+    id: TokenID,
+    form: String,
+    lemma: Option<String>,
+    upos: Option<UPOS>,
+    xpos: Option<String>,
+    features: Option<Features>,
+    head: Option<TokenID>,
+    deprel: Option<String>,
+    dep: Option<Vec<Dep>>,
+    misc: Option<String>,
+}
+
+impl TokenBuilder {
+    pub fn new(id: TokenID, form: String) -> TokenBuilder {
+        TokenBuilder {
+            id,
+            form,
+            lemma: None,
+            upos: None,
+            xpos: None,
+            features: None,
+            head: None,
+            deprel: None,
+            dep: None,
+            misc: None,
+        }
+    }
+
+    pub fn lemma(mut self, lemma: String) -> TokenBuilder {
+        self.lemma = Some(lemma);
+        self
+    }
+
+    pub fn upos(mut self, upos: UPOS) -> TokenBuilder {
+        self.upos = Some(upos);
+        self
+    }
+
+    pub fn xpos(mut self, xpos: String) -> TokenBuilder {
+        self.xpos = Some(xpos);
+        self
+    }
+
+    pub fn features(mut self, features: Features) -> TokenBuilder {
+        self.features = Some(features);
+        self
+    }
+
+    pub fn head(mut self, head: TokenID) -> TokenBuilder {
+        self.head = Some(head);
+        self
+    }
+
+    pub fn deprel(mut self, deprel: String) -> TokenBuilder {
+        self.deprel = Some(deprel);
+        self
+    }
+
+    pub fn dep(mut self, dep: Vec<Dep>) -> TokenBuilder {
+        self.dep = Some(dep);
+        self
+    }
+
+    pub fn misc(mut self, misc: String) -> TokenBuilder {
+        self.misc = Some(misc);
+        self
+    }
+
+    pub fn build(self) -> Token {
+        Token {
+            id: self.id,
+            form: self.form,
+            lemma: self.lemma,
+            upos: self.upos,
+            xpos: self.xpos,
+            features: self.features,
+            head: self.head,
+            deprel: self.deprel,
+            dep: self.dep,
+            misc: self.misc,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Dep {
     pub head: TokenID,
     pub rel: String,
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Sentence {
     pub meta: Vec<String>,
     pub tokens: Vec<Token>,
diff --git a/src/parsers.rs b/src/parsers.rs
index e8786bc..3c0c3d6 100644
--- a/src/parsers.rs
+++ b/src/parsers.rs
@@ -9,7 +9,7 @@ use std::{
 };
 use thiserror::Error;
 
-#[derive(Error, PartialEq, Debug)]
+#[derive(Error, PartialEq, Debug, Eq)]
 pub enum ParseIdError {
     #[error("Range must be two integers separated by -")]
     InvalidRange,
@@ -20,7 +20,7 @@ pub enum ParseIdError {
     },
 }
 
-#[derive(Error, Debug)]
+#[derive(Error, Debug, PartialEq, Eq)]
 pub enum ParseErrorType {
     #[error("Missing field: {0}")]
     MissingField(&'static str),
@@ -35,7 +35,7 @@ pub enum ParseErrorType {
     KeyValueParseError,
 }
 
-#[derive(Error, Debug)]
+#[derive(Error, Debug, PartialEq, Eq)]
 #[error("Parse error in line {line}: {err}")]
 pub struct ConlluParseError {
     line: usize,
@@ -253,6 +253,37 @@ pub fn parse_sentence(input: &str) -> Result<Sentence, ConlluParseError> {
     Ok(Sentence { meta, tokens })
 }
 
+/// A `Doc` is a wrapper around a type that implements [BufRead] and produces
+/// lines in CoNLL-U format that can be parsed into sentences, which
+/// can be accessed via iteration.
+///
+/// For the common use case of parsing a file in CoNLL-U format,
+/// this crate provides the convenience function [parse_file], which produces a `Doc<BufReader<File>>`.
+///
+/// ```rust
+/// use std::io::BufReader;
+/// use rs_conllu::{Sentence, Token, TokenID};
+/// use rs_conllu::parsers::Doc;
+///
+/// let conllu = "1\tSue\t_\t_\t_\t_\t_\t_\t_\t_
+/// 2\tlikes\t_\t_\t_\t_\t_\t_\t_\t_
+/// 3\tcoffee\t_\t_\t_\t_\t_\t_\t_\t_
+/// ".as_bytes();
+///
+/// let reader = BufReader::new(conllu);
+///
+/// let mut doc = Doc::new(reader);
+///
+/// assert_eq!(doc.next(), Some(Ok(Sentence {
+///     meta: vec![],
+///     tokens: vec![
+///         Token::builder(TokenID::Single(1), "Sue".to_string()).build(),
+///         Token::builder(TokenID::Single(2), "likes".to_string()).build(),
+///         Token::builder(TokenID::Single(3), "coffee".to_string()).build(),
+///     ]
+/// })));
+/// ```
+///
 pub struct Doc<T: BufRead> {
     reader: T,
     line_num: usize,
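The builder this patch introduces can be exercised as in the following minimal sketch. It is not part of the patch; it assumes the `rs_conllu` crate name used in the doc examples above and CoNLL-U-style variant names on the crate's existing `UPOS` enum (e.g. `UPOS::VERB`):

```rust
use rs_conllu::{Token, TokenID, UPOS};

fn main() {
    // Build a token, setting only some of the optional fields.
    // UPOS::VERB is assumed to follow the CoNLL-U tag names.
    let token: Token = Token::builder(TokenID::Single(2), "likes".to_string())
        .lemma("like".to_string())
        .upos(UPOS::VERB)
        .deprel("root".to_string())
        .build();

    // Fields that were never set remain None after build().
    assert_eq!(token.form, "likes");
    assert_eq!(token.xpos, None);
}
```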
From 7edd54b75d9f47b5ff4391dce8e390a47eea9bc5 Mon Sep 17 00:00:00 2001
From: David Helbig <52451401+davidhelbig@users.noreply.github.com>
Date: Wed, 7 Aug 2024 19:16:48 +0200
Subject: [PATCH 2/4] Restructuring of repository, more documentation

---
 src/lib.rs               | 127 ++-------------------------------
 src/parsers.rs           |  20 +++---
 src/token.rs             | 151 +++++++++++++++++++++++++++++++++++++++
 tests/file_parse_test.rs |   2 +-
 4 files changed, 166 insertions(+), 134 deletions(-)
 create mode 100644 src/token.rs

diff --git a/src/lib.rs b/src/lib.rs
index 52c1184..c9c6657 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,12 +29,15 @@
 
 #![allow(clippy::tabs_in_doc_comments)]
 
-use std::{collections::HashMap, error::Error, fmt, str::FromStr};
+use std::{error::Error, fmt, str::FromStr};
 
 pub mod cli;
 pub mod parsers;
+pub mod token;
 
-pub use crate::parsers::{parse_file, parse_sentence, parse_token};
+pub use token::{Dep, Token, TokenID};
+
+pub use parsers::{parse_file, parse_sentence, parse_token};
 
 pub struct Feature<'a>(pub &'a str, pub &'a str);
@@ -100,126 +103,6 @@ impl FromStr for UPOS {
     }
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum TokenID {
-    Single(usize),
-    Range(usize, usize),
-    Subordinate { major: usize, minor: usize },
-}
-
-type Features = HashMap<String, String>;
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct Token {
-    pub id: TokenID,
-    pub form: String,
-    pub lemma: Option<String>,
-    pub upos: Option<UPOS>,
-    pub xpos: Option<String>,
-    pub features: Option<Features>,
-    pub head: Option<TokenID>,
-    pub deprel: Option<String>,
-    pub dep: Option<Vec<Dep>>,
-    pub misc: Option<String>,
-}
-
-impl Token {
-    pub fn builder(id: TokenID, form: String) -> TokenBuilder {
-        TokenBuilder::new(id, form)
-    }
-}
-
-pub struct TokenBuilder {
-    id: TokenID,
-    form: String,
-    lemma: Option<String>,
-    upos: Option<UPOS>,
-    xpos: Option<String>,
-    features: Option<Features>,
-    head: Option<TokenID>,
-    deprel: Option<String>,
-    dep: Option<Vec<Dep>>,
-    misc: Option<String>,
-}
-
-impl TokenBuilder {
-    pub fn new(id: TokenID, form: String) -> TokenBuilder {
-        TokenBuilder {
-            id,
-            form,
-            lemma: None,
-            upos: None,
-            xpos: None,
-            features: None,
-            head: None,
-            deprel: None,
-            dep: None,
-            misc: None,
-        }
-    }
-
-    pub fn lemma(mut self, lemma: String) -> TokenBuilder {
-        self.lemma = Some(lemma);
-        self
-    }
-
-    pub fn upos(mut self, upos: UPOS) -> TokenBuilder {
-        self.upos = Some(upos);
-        self
-    }
-
-    pub fn xpos(mut self, xpos: String) -> TokenBuilder {
-        self.xpos = Some(xpos);
-        self
-    }
-
-    pub fn features(mut self, features: Features) -> TokenBuilder {
-        self.features = Some(features);
-        self
-    }
-
-    pub fn head(mut self, head: TokenID) -> TokenBuilder {
-        self.head = Some(head);
-        self
-    }
-
-    pub fn deprel(mut self, deprel: String) -> TokenBuilder {
-        self.deprel = Some(deprel);
-        self
-    }
-
-    pub fn dep(mut self, dep: Vec<Dep>) -> TokenBuilder {
-        self.dep = Some(dep);
-        self
-    }
-
-    pub fn misc(mut self, misc: String) -> TokenBuilder {
-        self.misc = Some(misc);
-        self
-    }
-
-    pub fn build(self) -> Token {
-        Token {
-            id: self.id,
-            form: self.form,
-            lemma: self.lemma,
-            upos: self.upos,
-            xpos: self.xpos,
-            features: self.features,
-            head: self.head,
-            deprel: self.deprel,
-            dep: self.dep,
-            misc: self.misc,
-        }
-    }
-}
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct Dep {
-    pub head: TokenID,
-    pub rel: String,
-}
-
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Sentence {
     pub meta: Vec<String>,
diff --git a/src/parsers.rs b/src/parsers.rs
index 3c0c3d6..3ff77d2 100644
--- a/src/parsers.rs
+++ b/src/parsers.rs
@@ -1,4 +1,3 @@
-use crate::{Dep, ParseUposError, Sentence, Token, TokenID, UPOS};
 use std::{
     collections::HashMap,
     fs::File,
@@ -9,6 +8,11 @@ use std::{
 };
 use thiserror::Error;
 
+use crate::{
+    token::{Dep, Token, TokenID},
+    ParseUposError, Sentence, UPOS,
+};
+
 #[derive(Error, PartialEq, Debug, Eq)]
 pub enum ParseIdError {
     #[error("Range must be two integers separated by -")]
@@ -172,10 +176,7 @@ fn parse_id(field: &str) -> Result<TokenID, ParseIdError> {
 
         return match sep {
             '-' => Ok(TokenID::Range(ids[0], ids[1])),
-            '.' => Ok(TokenID::Subordinate {
-                major: ids[0],
-                minor: ids[1],
-            }),
+            '.' => Ok(TokenID::Empty(ids[0], ids[1])),
             _ => panic!(),
         };
     }
@@ -346,7 +347,7 @@ impl<T: BufRead> Iterator for Doc<T> {
 mod test {
     use std::collections::HashMap;
 
-    use crate::{Token, TokenID, UPOS};
+    use crate::{Token, UPOS};
 
     use super::*;
 
@@ -361,11 +362,8 @@ mod test {
     }
 
     #[test]
-    fn can_parse_id_subordinate() {
-        assert_eq!(
-            parse_id("5.6"),
-            Ok(TokenID::Subordinate { major: 5, minor: 6 })
-        );
+    fn can_parse_id_empty() {
+        assert_eq!(parse_id("5.6"), Ok(TokenID::Empty(5, 6)));
     }
 
     #[test]
diff --git a/src/token.rs b/src/token.rs
new file mode 100644
index 0000000..9c8d36f
--- /dev/null
+++ b/src/token.rs
@@ -0,0 +1,151 @@
+use std::collections::HashMap;
+
+use crate::UPOS;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenID {
+    /// The standard, single index.
+    Single(usize),
+    /// A range of tokens that form an ID. Denoted by a hyphen
+    /// in CoNLL-U format (e.g. 1-3).
+    Range(usize, usize),
+    /// To represent ellipses, CoNLL-U allows creating sub-indices of the preceding
+    /// regular node (or 0 if it is at the beginning of a sentence). They are separated
+    /// by a decimal point and represent an "empty" node.
+    Empty(usize, usize),
+}
+
+type Features = HashMap<String, String>;
+
+/// A `Token` is the basic unit of what is defined on a (non-comment) line in CoNLL-U format.
+/// The CoNLL-U specification uses the terms "word", "node" and "multi-word token", while this crate
+/// uses the general notion of "Token" to subsume all of the above.
+///
+/// The fields of a `Token` are the ten fields that are defined in the CoNLL-U specification.
+/// The only mandatory fields are [Token::id] and [Token::form]. The remaining ones are optional (absence denoted
+/// by an underscore in the text format) and represented as [Option] types.
+///
+/// A [TokenBuilder] type is available for more convenient creation of [Token] structs.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Token {
+    pub id: TokenID,
+    pub form: String,
+    pub lemma: Option<String>,
+    pub upos: Option<UPOS>,
+    pub xpos: Option<String>,
+    pub features: Option<Features>,
+    pub head: Option<TokenID>,
+    pub deprel: Option<String>,
+    pub dep: Option<Vec<Dep>>,
+    pub misc: Option<String>,
+}
+
+impl Token {
+    /// Return a new [TokenBuilder]
+    pub fn builder(id: TokenID, form: String) -> TokenBuilder {
+        TokenBuilder::new(id, form)
+    }
+}
+
+/// A builder for [Token]s that allows for more convenient manual creation when necessary.
+///
+/// ```rust
+/// use rs_conllu::{Token, TokenID};
+///
+/// // Get a new builder from Token
+/// let token = Token::builder(TokenID::Single(1), "Hello".to_string())
+///     .lemma("Hello".to_string())
+///     .build();
+///
+/// ```
+pub struct TokenBuilder {
+    id: TokenID,
+    form: String,
+    lemma: Option<String>,
+    upos: Option<UPOS>,
+    xpos: Option<String>,
+    features: Option<Features>,
+    head: Option<TokenID>,
+    deprel: Option<String>,
+    dep: Option<Vec<Dep>>,
+    misc: Option<String>,
+}
+
+impl TokenBuilder {
+    pub fn new(id: TokenID, form: String) -> TokenBuilder {
+        TokenBuilder {
+            id,
+            form,
+            lemma: None,
+            upos: None,
+            xpos: None,
+            features: None,
+            head: None,
+            deprel: None,
+            dep: None,
+            misc: None,
+        }
+    }
+
+    /// Set the lemma of the token.
+    pub fn lemma(mut self, lemma: String) -> TokenBuilder {
+        self.lemma = Some(lemma);
+        self
+    }
+
+    pub fn upos(mut self, upos: UPOS) -> TokenBuilder {
+        self.upos = Some(upos);
+        self
+    }
+
+    pub fn xpos(mut self, xpos: String) -> TokenBuilder {
+        self.xpos = Some(xpos);
+        self
+    }
+
+    pub fn features(mut self, features: Features) -> TokenBuilder {
+        self.features = Some(features);
+        self
+    }
+
+    pub fn head(mut self, head: TokenID) -> TokenBuilder {
+        self.head = Some(head);
+        self
+    }
+
+    pub fn deprel(mut self, deprel: String) -> TokenBuilder {
+        self.deprel = Some(deprel);
+        self
+    }
+
+    pub fn dep(mut self, dep: Vec<Dep>) -> TokenBuilder {
+        self.dep = Some(dep);
+        self
+    }
+
+    pub fn misc(mut self, misc: String) -> TokenBuilder {
+        self.misc = Some(misc);
+        self
+    }
+
+    pub fn build(self) -> Token {
+        Token {
+            id: self.id,
+            form: self.form,
+            lemma: self.lemma,
+            upos: self.upos,
+            xpos: self.xpos,
+            features: self.features,
+            head: self.head,
+            deprel: self.deprel,
+            dep: self.dep,
+            misc: self.misc,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Dep {
+    pub head: TokenID,
+    pub rel: String,
+}
diff --git a/tests/file_parse_test.rs b/tests/file_parse_test.rs
index b58b525..1abd220 100644
--- a/tests/file_parse_test.rs
+++ b/tests/file_parse_test.rs
@@ -1,6 +1,6 @@
 use std::{collections::HashMap, fs::File};
 
-use rs_conllu::{parse_file, Dep, Token, TokenID, UPOS};
+use rs_conllu::{parse_file, token::Dep, token::Token, token::TokenID, UPOS};
 
 #[test]
 fn test_file_parse() {
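With the restructuring in place, the three `TokenID` forms from the new `token` module — including the `Empty` variant that replaces `Subordinate` — can be matched exhaustively. A sketch with a hypothetical `describe` helper, again assuming the `rs_conllu` crate name:

```rust
use rs_conllu::TokenID;

// Describe each ID form as it would appear in the ID column
// of a CoNLL-U file.
fn describe(id: TokenID) -> String {
    match id {
        TokenID::Single(i) => format!("word {i}"),
        TokenID::Range(from, to) => format!("multi-word token {from}-{to}"),
        TokenID::Empty(major, minor) => format!("empty node {major}.{minor}"),
    }
}

fn main() {
    // "5.6" in the ID column parses to an empty node per the renamed variant.
    assert_eq!(describe(TokenID::Empty(5, 6)), "empty node 5.6");
    assert_eq!(describe(TokenID::Range(1, 3)), "multi-word token 1-3");
}
```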
From 0c4305d1c23c25c1d328242ee513a78fd05d78b7 Mon Sep 17 00:00:00 2001
From: David Helbig <52451401+davidhelbig@users.noreply.github.com>
Date: Thu, 8 Aug 2024 18:55:30 +0200
Subject: [PATCH 3/4] Rename dep -> deps, more documentation

---
 src/parsers.rs           | 10 +++++-----
 src/token.rs             | 34 ++++++++++++++++++++++++++--------
 tests/file_parse_test.rs |  2 +-
 3 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/src/parsers.rs b/src/parsers.rs
index 3ff77d2..ff02fed 100644
--- a/src/parsers.rs
+++ b/src/parsers.rs
@@ -73,7 +73,7 @@ pub fn parse_file(file: File) -> Doc<BufReader<File>> {
 ///     features: None,
 ///     head: Some(TokenID::Single(3)),
 ///     deprel: Some("nmod".to_string()),
-///     dep: None,
+///     deps: None,
 ///     misc: None
 /// });
 /// ```
@@ -125,10 +125,10 @@ pub fn parse_token(line: &str) -> Result<Token, ConlluParseError> {
         .ok_or(ParseErrorType::MissingField("deprel"))?;
     let deprel = placeholder(deprel).map(String::from);
 
-    let dep = fields_iter
+    let deps = fields_iter
         .next()
         .ok_or(ParseErrorType::MissingField("deps"))?;
-    let dep = placeholder_result(dep, parse_deps).transpose()?;
+    let deps = placeholder_result(deps, parse_deps).transpose()?;
 
     let misc = fields_iter
         .next()
@@ -144,7 +144,7 @@ pub fn parse_token(line: &str) -> Result<Token, ConlluParseError> {
         features,
         head,
         deprel,
-        dep,
+        deps,
         misc,
     })
 }
@@ -387,7 +387,7 @@
             features: Some(features),
             head: Some(TokenID::Single(3)),
            deprel: Some("det".to_string()),
-            dep: None,
+            deps: None,
             misc: None,
         };
diff --git a/src/token.rs b/src/token.rs
index 9c8d36f..109ca44 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -28,20 +28,30 @@ type Features = HashMap<String, String>;
 /// A [TokenBuilder] type is available for more convenient creation of [Token] structs.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Token {
+    /// The id of the token within the sentence.
     pub id: TokenID,
+    /// The surface form of the token as it appears in the sentence.
     pub form: String,
+    /// The lemma or lexical form of the token.
     pub lemma: Option<String>,
+    /// The universal POS tag of the token.
     pub upos: Option<UPOS>,
+    /// Language-specific POS tag for the token.
     pub xpos: Option<String>,
+    /// Morphological features of the token as key-value pairs.
     pub features: Option<Features>,
+    /// The head of the current token.
     pub head: Option<TokenID>,
+    /// The dependency relation of the token.
     pub deprel: Option<String>,
-    pub dep: Option<Vec<Dep>>,
+    /// Enhanced dependency graph information.
+    pub deps: Option<Vec<Dep>>,
+    /// Other types of annotation.
     pub misc: Option<String>,
 }
 
 impl Token {
-    /// Return a new [TokenBuilder]
+    /// Return a new [TokenBuilder].
     pub fn builder(id: TokenID, form: String) -> TokenBuilder {
         TokenBuilder::new(id, form)
     }
@@ -67,7 +77,7 @@ pub struct TokenBuilder {
     features: Option<Features>,
     head: Option<TokenID>,
     deprel: Option<String>,
-    dep: Option<Vec<Dep>>,
+    deps: Option<Vec<Dep>>,
     misc: Option<String>,
 }
 
@@ -82,52 +92,60 @@ impl TokenBuilder {
             features: None,
             head: None,
             deprel: None,
-            dep: None,
+            deps: None,
             misc: None,
         }
     }
 
-    /// Set the lemma of the token.
+    /// Set the lemma field.
     pub fn lemma(mut self, lemma: String) -> TokenBuilder {
         self.lemma = Some(lemma);
         self
     }
 
+    /// Set the universal POS tag field.
     pub fn upos(mut self, upos: UPOS) -> TokenBuilder {
         self.upos = Some(upos);
         self
     }
 
+    /// Set the xpos field.
     pub fn xpos(mut self, xpos: String) -> TokenBuilder {
         self.xpos = Some(xpos);
         self
     }
 
+    /// Set the features field.
     pub fn features(mut self, features: Features) -> TokenBuilder {
         self.features = Some(features);
         self
     }
 
+    /// Set the head field.
     pub fn head(mut self, head: TokenID) -> TokenBuilder {
         self.head = Some(head);
         self
     }
 
+    /// Set the deprel field.
     pub fn deprel(mut self, deprel: String) -> TokenBuilder {
         self.deprel = Some(deprel);
         self
     }
 
-    pub fn dep(mut self, dep: Vec<Dep>) -> TokenBuilder {
-        self.dep = Some(dep);
+    /// Set the deps field.
+    pub fn deps(mut self, dep: Vec<Dep>) -> TokenBuilder {
+        self.deps = Some(dep);
         self
     }
 
+    /// Set the misc field.
     pub fn misc(mut self, misc: String) -> TokenBuilder {
         self.misc = Some(misc);
         self
     }
 
+    /// Build the token.
     pub fn build(self) -> Token {
         Token {
             id: self.id,
@@ -138,7 +156,7 @@ impl TokenBuilder {
             features: self.features,
             head: self.head,
             deprel: self.deprel,
-            dep: self.dep,
+            deps: self.deps,
             misc: self.misc,
         }
     }
diff --git a/tests/file_parse_test.rs b/tests/file_parse_test.rs
index 1abd220..cdcc891 100644
--- a/tests/file_parse_test.rs
+++ b/tests/file_parse_test.rs
@@ -26,7 +26,7 @@ fn test_file_parse() {
             ])),
             head: Some(TokenID::Single(2)),
             deprel: Some("nsubj".to_string()),
-            dep: Some(vec![
+            deps: Some(vec![
                 Dep {
                     head: TokenID::Single(2),
                     rel: "nsubj".to_string()
From 197e8503efa9dba6886d7708202e80ffdf6a061e Mon Sep 17 00:00:00 2001
From: David Helbig <52451401+davidhelbig@users.noreply.github.com>
Date: Sat, 10 Aug 2024 20:18:09 +0200
Subject: [PATCH 4/4] Minor doc changes

---
 src/token.rs | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/token.rs b/src/token.rs
index 109ca44..9b70fd8 100644
--- a/src/token.rs
+++ b/src/token.rs
@@ -18,14 +18,16 @@ type Features = HashMap<String, String>;
 
 /// A `Token` is the basic unit of what is defined on a (non-comment) line in CoNLL-U format.
-/// The CoNLL-U specification uses the terms "word", "node" and "multi-word token", while this crate
-/// uses the general notion of "Token" to subsume all of the above.
+/// The CoNLL-U specification uses the terms _word_, _node_ and _multi-word token_, while this crate
+/// uses the general notion of _Token_ to subsume all of the above.
 ///
 /// The fields of a `Token` are the ten fields that are defined in the CoNLL-U specification.
-/// The only mandatory fields are [Token::id] and [Token::form]. The remaining ones are optional (absence denoted
+/// The only mandatory fields are [id](Token::id) and [form](Token::form). The remaining ones are optional (absence denoted
 /// by an underscore in the text format) and represented as [Option] types.
 ///
-/// A [TokenBuilder] type is available for more convenient creation of [Token] structs.
+/// A [TokenBuilder] type is available for more convenient creation of [Token] structs,
+/// which can be instantiated via the [builder](Token::builder) method.
+///
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Token {
     /// The id of the token within the sentence.
@@ -82,6 +84,8 @@ impl TokenBuilder {
+    /// Constructor for [TokenBuilder]. Both `id` and `form` are mandatory
+    /// fields and thus required when instantiating.
     pub fn new(id: TokenID, form: String) -> TokenBuilder {
         TokenBuilder {
             id,
             form,
@@ -164,6 +168,8 @@ impl TokenBuilder {
 
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Dep {
+    /// The head of the relation.
     pub head: TokenID,
+    /// The type of the relation.
     pub rel: String,
 }
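Taken together, the series leaves the parsing API roughly as sketched below. The shape of `parse_token` and of the resulting fields follows the doc examples inside the patches; the input line is invented, and the `NOUN` tag is assumed to parse via the crate's `FromStr` impl for `UPOS`:

```rust
use rs_conllu::{parse_token, TokenID};

fn main() {
    // One CoNLL-U line with the ten tab-separated columns:
    // ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC.
    let line = "3\tcoffee\tcoffee\tNOUN\t_\t_\t2\tobj\t_\t_";

    let token = parse_token(line).expect("line should parse");

    // Underscore placeholders come back as None; set columns as Some(...).
    assert_eq!(token.id, TokenID::Single(3));
    assert_eq!(token.head, Some(TokenID::Single(2)));
    assert_eq!(token.deprel, Some("obj".to_string()));
}
```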