diff --git a/src/lib.rs b/src/lib.rs
index 240b1fb..c9c6657 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,16 +29,19 @@
 #![allow(clippy::tabs_in_doc_comments)]
 
-use std::{collections::HashMap, error::Error, fmt, str::FromStr};
+use std::{error::Error, fmt, str::FromStr};
 
 pub mod cli;
 pub mod parsers;
+pub mod token;
 
-pub use crate::parsers::{parse_file, parse_sentence, parse_token};
+pub use token::{Dep, Token, TokenID};
+
+pub use parsers::{parse_file, parse_sentence, parse_token};
 
 pub struct Feature<'a>(pub &'a str, pub &'a str);
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct ParseUposError;
 
 impl fmt::Display for ParseUposError {
@@ -100,36 +103,7 @@ impl FromStr for UPOS {
     }
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum TokenID {
-    Single(usize),
-    Range(usize, usize),
-    Subordinate { major: usize, minor: usize },
-}
-
-type Features = HashMap<String, String>;
-
-#[derive(Debug, Clone, PartialEq)]
-pub struct Token {
-    pub id: TokenID,
-    pub form: String,
-    pub lemma: Option<String>,
-    pub upos: Option<UPOS>,
-    pub xpos: Option<String>,
-    pub features: Option<Features>,
-    pub head: Option<TokenID>,
-    pub deprel: Option<String>,
-    pub dep: Option<Vec<Dep>>,
-    pub misc: Option<String>,
-}
-
-#[derive(Debug, Clone, PartialEq)]
-pub struct Dep {
-    pub head: TokenID,
-    pub rel: String,
-}
-
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Sentence {
     pub meta: Vec<String>,
     pub tokens: Vec<Token>,
diff --git a/src/parsers.rs b/src/parsers.rs
index e8786bc..ff02fed 100644
--- a/src/parsers.rs
+++ b/src/parsers.rs
@@ -1,4 +1,3 @@
-use crate::{Dep, ParseUposError, Sentence, Token, TokenID, UPOS};
 use std::{
     collections::HashMap,
     fs::File,
@@ -9,7 +8,12 @@ use std::{
 };
 use thiserror::Error;
 
-#[derive(Error, PartialEq, Debug)]
+use crate::{
+    token::{Dep, Token, TokenID},
+    ParseUposError, Sentence, UPOS,
+};
+
+#[derive(Error, PartialEq, Debug, Eq)]
 pub enum ParseIdError {
     #[error("Range must be two integers separated by -")]
     InvalidRange,
@@ -20,7 +24,7 @@ pub enum ParseIdError {
     },
 }
 
-#[derive(Error, Debug)]
+#[derive(Error, Debug, PartialEq, Eq)]
 pub enum ParseErrorType {
     #[error("Missing field: {0}")]
     MissingField(&'static str),
@@ -35,7 +39,7 @@ pub enum ParseErrorType {
     KeyValueParseError,
 }
 
-#[derive(Error, Debug)]
+#[derive(Error, Debug, PartialEq, Eq)]
 #[error("Parse error in line {line}: {err}")]
 pub struct ConlluParseError {
     line: usize,
@@ -69,7 +73,7 @@ pub fn parse_file(file: File) -> Doc<BufReader<File>> {
 ///     features: None,
 ///     head: Some(TokenID::Single(3)),
 ///     deprel: Some("nmod".to_string()),
-///     dep: None,
+///     deps: None,
 ///     misc: None
 /// });
 /// ```
@@ -121,10 +125,10 @@ pub fn parse_token(line: &str) -> Result<Token, ParseErrorType> {
         .ok_or(ParseErrorType::MissingField("deprel"))?;
     let deprel = placeholder(deprel).map(String::from);
 
-    let dep = fields_iter
+    let deps = fields_iter
         .next()
         .ok_or(ParseErrorType::MissingField("deps"))?;
-    let dep = placeholder_result(dep, parse_deps).transpose()?;
+    let deps = placeholder_result(deps, parse_deps).transpose()?;
 
     let misc = fields_iter
         .next()
@@ -140,7 +144,7 @@ pub fn parse_token(line: &str) -> Result<Token, ParseErrorType> {
         features,
         head,
         deprel,
-        dep,
+        deps,
         misc,
     })
 }
@@ -172,10 +176,7 @@ fn parse_id(field: &str) -> Result<TokenID, ParseIdError> {
 
         return match sep {
             '-' => Ok(TokenID::Range(ids[0], ids[1])),
-            '.' => Ok(TokenID::Subordinate {
-                major: ids[0],
-                minor: ids[1],
-            }),
+            '.' => Ok(TokenID::Empty(ids[0], ids[1])),
             _ => panic!(),
         };
     }
@@ -253,6 +254,37 @@ pub fn parse_sentence(input: &str) -> Result<Sentence, ConlluParseError> {
     Ok(Sentence { meta, tokens })
 }
 
+/// A `Doc` is a wrapper around a type that implements [BufRead] and produces
+/// lines in CoNLL-U format that can be parsed into sentences, which
+/// can be accessed via iteration.
+///
+/// For the common use case of parsing a file in CoNLL-U format, this crate
+/// provides the convenience function [parse_file], which produces a `Doc<BufReader<File>>`.
+///
+/// ```rust
+/// use std::io::BufReader;
+/// use rs_conllu::{Sentence, Token, TokenID};
+/// use rs_conllu::parsers::Doc;
+///
+/// let conllu = "1\tSue\t_\t_\t_\t_\t_\t_\t_\t_
+/// 2\tlikes\t_\t_\t_\t_\t_\t_\t_\t_
+/// 3\tcoffee\t_\t_\t_\t_\t_\t_\t_\t_
+/// ".as_bytes();
+///
+/// let reader = BufReader::new(conllu);
+///
+/// let mut doc = Doc::new(reader);
+///
+/// assert_eq!(doc.next(), Some(Ok(Sentence {
+///     meta: vec![],
+///     tokens: vec![
+///         Token::builder(TokenID::Single(1), "Sue".to_string()).build(),
+///         Token::builder(TokenID::Single(2), "likes".to_string()).build(),
+///         Token::builder(TokenID::Single(3), "coffee".to_string()).build(),
+///     ]
+/// })));
+/// ```
+///
 pub struct Doc<T> {
     reader: T,
     line_num: usize,
@@ -315,7 +347,7 @@ impl<T: BufRead> Iterator for Doc<T> {
 mod test {
     use std::collections::HashMap;
 
-    use crate::{Token, TokenID, UPOS};
+    use crate::{Token, UPOS};
 
     use super::*;
@@ -330,11 +362,8 @@ mod test {
     }
 
     #[test]
-    fn can_parse_id_subordinate() {
-        assert_eq!(
-            parse_id("5.6"),
-            Ok(TokenID::Subordinate { major: 5, minor: 6 })
-        );
+    fn can_parse_id_empty() {
+        assert_eq!(parse_id("5.6"), Ok(TokenID::Empty(5, 6)));
     }
 
     #[test]
@@ -358,7 +387,7 @@ mod test {
             features: Some(features),
             head: Some(TokenID::Single(3)),
             deprel: Some("det".to_string()),
-            dep: None,
+            deps: None,
             misc: None,
         };
diff --git a/src/token.rs b/src/token.rs
new file mode 100644
index 0000000..9b70fd8
--- /dev/null
+++ b/src/token.rs
@@ -0,0 +1,175 @@
+use std::collections::HashMap;
+
+use crate::UPOS;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenID {
+    /// The standard, single index.
+    Single(usize),
+    /// A range of tokens that form an ID. Denoted by a hyphen
+    /// in CoNLL-U format (e.g. 1-3).
+    Range(usize, usize),
+    /// To represent ellipses, CoNLL-U allows creating sub-indices of the preceding
+    /// regular node (or 0 if it is at the beginning of a sentence). They are separated
+    /// by a decimal point and represent an "empty" node.
+    Empty(usize, usize),
+}
+
+type Features = HashMap<String, String>;
+
+/// A `Token` is the basic unit of what is defined on a (non-comment) line in CoNLL-U format.
+/// The CoNLL-U specification uses the terms _word_, _node_ and _multi-word token_, while this crate
+/// uses the general notion of _token_ to subsume all of the above.
+///
+/// The fields of a `Token` are the ten fields that are defined in the CoNLL-U specification.
+/// The only mandatory fields are [id](Token::id) and [form](Token::form). The remaining ones are optional (absence denoted
+/// by an underscore in the text format) and represented as [Option] types.
+///
+/// A [TokenBuilder] type is available for more convenient creation of [Token] structs,
+/// which can be instantiated via the [builder](Token::builder) method.
+///
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Token {
+    /// The id of the token within the sentence.
+    pub id: TokenID,
+    /// The surface form of the token as it appears in the sentence.
+    pub form: String,
+    /// The lemma or lexical form of the token.
+    pub lemma: Option<String>,
+    /// The universal POS tag of the token.
+    pub upos: Option<UPOS>,
+    /// Language-specific POS tag for the token.
+    pub xpos: Option<String>,
+    /// Morphological features of the token as key-value pairs.
+    pub features: Option<Features>,
+    /// The head of the current token.
+    pub head: Option<TokenID>,
+    /// The dependency relation of the token.
+    pub deprel: Option<String>,
+    /// Enhanced dependency graph information.
+    pub deps: Option<Vec<Dep>>,
+    /// Other types of annotation.
+    pub misc: Option<String>,
+}
+
+impl Token {
+    /// Return a new [TokenBuilder].
+    pub fn builder(id: TokenID, form: String) -> TokenBuilder {
+        TokenBuilder::new(id, form)
+    }
+}
+
+/// A builder for [Token]s to allow for more convenient manual creation if necessary.
+///
+/// ```rust
+/// use rs_conllu::{Token, TokenID};
+///
+/// // Get a new builder from Token
+/// let token = Token::builder(TokenID::Single(1), "Hello".to_string())
+///     .lemma("Hello".to_string())
+///     .build();
+///
+/// ```
+pub struct TokenBuilder {
+    id: TokenID,
+    form: String,
+    lemma: Option<String>,
+    upos: Option<UPOS>,
+    xpos: Option<String>,
+    features: Option<Features>,
+    head: Option<TokenID>,
+    deprel: Option<String>,
+    deps: Option<Vec<Dep>>,
+    misc: Option<String>,
+}
+
+impl TokenBuilder {
+    /// Constructor for [TokenBuilder]. Both `id` and `form` are mandatory
+    /// fields and thus required when instantiating.
+    pub fn new(id: TokenID, form: String) -> TokenBuilder {
+        TokenBuilder {
+            id,
+            form,
+            lemma: None,
+            upos: None,
+            xpos: None,
+            features: None,
+            head: None,
+            deprel: None,
+            deps: None,
+            misc: None,
+        }
+    }
+
+    /// Set the lemma field.
+    pub fn lemma(mut self, lemma: String) -> TokenBuilder {
+        self.lemma = Some(lemma);
+        self
+    }
+
+    /// Set the universal POS tag field.
+    pub fn upos(mut self, upos: UPOS) -> TokenBuilder {
+        self.upos = Some(upos);
+        self
+    }
+
+    /// Set the xpos field.
+    pub fn xpos(mut self, xpos: String) -> TokenBuilder {
+        self.xpos = Some(xpos);
+        self
+    }
+
+    /// Set the features field.
+    pub fn features(mut self, features: Features) -> TokenBuilder {
+        self.features = Some(features);
+        self
+    }
+
+    /// Set the head field.
+    pub fn head(mut self, head: TokenID) -> TokenBuilder {
+        self.head = Some(head);
+        self
+    }
+
+    /// Set the deprel field.
+    pub fn deprel(mut self, deprel: String) -> TokenBuilder {
+        self.deprel = Some(deprel);
+        self
+    }
+
+    /// Set the deps field.
+    pub fn deps(mut self, dep: Vec<Dep>) -> TokenBuilder {
+        self.deps = Some(dep);
+        self
+    }
+
+    /// Set the misc field.
+    pub fn misc(mut self, misc: String) -> TokenBuilder {
+        self.misc = Some(misc);
+        self
+    }
+
+    /// Build the token.
+    pub fn build(self) -> Token {
+        Token {
+            id: self.id,
+            form: self.form,
+            lemma: self.lemma,
+            upos: self.upos,
+            xpos: self.xpos,
+            features: self.features,
+            head: self.head,
+            deprel: self.deprel,
+            deps: self.deps,
+            misc: self.misc,
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Dep {
+    /// The head of the relation.
+    pub head: TokenID,
+    /// The type of the relation.
+ pub rel: String, +} diff --git a/tests/file_parse_test.rs b/tests/file_parse_test.rs index b58b525..cdcc891 100644 --- a/tests/file_parse_test.rs +++ b/tests/file_parse_test.rs @@ -1,6 +1,6 @@ use std::{collections::HashMap, fs::File}; -use rs_conllu::{parse_file, Dep, Token, TokenID, UPOS}; +use rs_conllu::{parse_file, token::Dep, token::Token, token::TokenID, UPOS}; #[test] fn test_file_parse() { @@ -26,7 +26,7 @@ fn test_file_parse() { ])), head: Some(TokenID::Single(2)), deprel: Some("nsubj".to_string()), - dep: Some(vec![ + deps: Some(vec![ Dep { head: TokenID::Single(2), rel: "nsubj".to_string()