Skip to content

Commit

Permalink
Merge pull request dahelb#5 from davidhelbig/feat/token-builder
Browse files Browse the repository at this point in the history
Refactoring, adding TokenBuilder plus functionality and documentation
  • Loading branch information
dahelb authored Aug 10, 2024
2 parents d759a70 + 197e850 commit 0483a8a
Show file tree
Hide file tree
Showing 4 changed files with 232 additions and 54 deletions.
40 changes: 7 additions & 33 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,19 @@
#![allow(clippy::tabs_in_doc_comments)]

use std::{collections::HashMap, error::Error, fmt, str::FromStr};
use std::{error::Error, fmt, str::FromStr};

pub mod cli;
pub mod parsers;
pub mod token;

pub use crate::parsers::{parse_file, parse_sentence, parse_token};
pub use token::{Dep, Token, TokenID};

pub use parsers::{parse_file, parse_sentence, parse_token};

pub struct Feature<'a>(pub &'a str, pub &'a str);

#[derive(Debug)]
#[derive(Debug, PartialEq, Eq)]
pub struct ParseUposError;

impl fmt::Display for ParseUposError {
Expand Down Expand Up @@ -100,36 +103,7 @@ impl FromStr for UPOS {
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenID {
Single(usize),
Range(usize, usize),
Subordinate { major: usize, minor: usize },
}

type Features = HashMap<String, String>;

#[derive(Debug, Clone, PartialEq)]
pub struct Token {
pub id: TokenID,
pub form: String,
pub lemma: Option<String>,
pub upos: Option<UPOS>,
pub xpos: Option<String>,
pub features: Option<Features>,
pub head: Option<TokenID>,
pub deprel: Option<String>,
pub dep: Option<Vec<Dep>>,
pub misc: Option<String>,
}

#[derive(Debug, Clone, PartialEq)]
pub struct Dep {
pub head: TokenID,
pub rel: String,
}

#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Sentence {
pub meta: Vec<String>,
pub tokens: Vec<Token>,
Expand Down
67 changes: 48 additions & 19 deletions src/parsers.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use crate::{Dep, ParseUposError, Sentence, Token, TokenID, UPOS};
use std::{
collections::HashMap,
fs::File,
Expand All @@ -9,7 +8,12 @@ use std::{
};
use thiserror::Error;

#[derive(Error, PartialEq, Debug)]
use crate::{
token::{Dep, Token, TokenID},
ParseUposError, Sentence, UPOS,
};

#[derive(Error, PartialEq, Debug, Eq)]
pub enum ParseIdError {
#[error("Range must be two integers separated by -")]
InvalidRange,
Expand All @@ -20,7 +24,7 @@ pub enum ParseIdError {
},
}

#[derive(Error, Debug)]
#[derive(Error, Debug, PartialEq, Eq)]
pub enum ParseErrorType {
#[error("Missing field: {0}")]
MissingField(&'static str),
Expand All @@ -35,7 +39,7 @@ pub enum ParseErrorType {
KeyValueParseError,
}

#[derive(Error, Debug)]
#[derive(Error, Debug, PartialEq, Eq)]
#[error("Parse error in line {line}: {err}")]
pub struct ConlluParseError {
line: usize,
Expand Down Expand Up @@ -69,7 +73,7 @@ pub fn parse_file(file: File) -> Doc<BufReader<File>> {
/// features: None,
/// head: Some(TokenID::Single(3)),
/// deprel: Some("nmod".to_string()),
/// dep: None,
/// deps: None,
/// misc: None
/// });
/// ```
Expand Down Expand Up @@ -121,10 +125,10 @@ pub fn parse_token(line: &str) -> Result<Token, ParseErrorType> {
.ok_or(ParseErrorType::MissingField("deprel"))?;
let deprel = placeholder(deprel).map(String::from);

let dep = fields_iter
let deps = fields_iter
.next()
.ok_or(ParseErrorType::MissingField("deps"))?;
let dep = placeholder_result(dep, parse_deps).transpose()?;
let deps = placeholder_result(deps, parse_deps).transpose()?;

let misc = fields_iter
.next()
Expand All @@ -140,7 +144,7 @@ pub fn parse_token(line: &str) -> Result<Token, ParseErrorType> {
features,
head,
deprel,
dep,
deps,
misc,
})
}
Expand Down Expand Up @@ -172,10 +176,7 @@ fn parse_id(field: &str) -> Result<TokenID, ParseIdError> {

return match sep {
'-' => Ok(TokenID::Range(ids[0], ids[1])),
'.' => Ok(TokenID::Subordinate {
major: ids[0],
minor: ids[1],
}),
'.' => Ok(TokenID::Empty(ids[0], ids[1])),
_ => panic!(),
};
}
Expand Down Expand Up @@ -253,6 +254,37 @@ pub fn parse_sentence(input: &str) -> Result<Sentence, ConlluParseError> {
Ok(Sentence { meta, tokens })
}

/// A `Doc` is a wrapper around a type that implements [BufRead] and produces
/// lines in CoNLL-U format that can be parsed into sentences, which
/// can be accessed via iteration.
///
/// For the common use case of parsing a file in CoNLL-U format,
/// this crate provides the convenience function [parse_file], which produces a `Doc<BufReader<File>>`.
///
/// ```rust
/// use std::io::BufReader;
/// use rs_conllu::{Sentence, Token, TokenID};
/// use rs_conllu::parsers::Doc;
///
/// let conllu = "1\tSue\t_\t_\t_\t_\t_\t_\t_\t_
/// 2\tlikes\t_\t_\t_\t_\t_\t_\t_\t_
/// 3\tcoffee\t_\t_\t_\t_\t_\t_\t_\t_
/// ".as_bytes();
///
/// let reader = BufReader::new(conllu);
///
/// let mut doc = Doc::new(reader);
///
/// assert_eq!(doc.next(), Some(Ok(Sentence {
/// meta: vec![],
/// tokens: vec![
/// Token::builder(TokenID::Single(1), "Sue".to_string()).build(),
/// Token::builder(TokenID::Single(2), "likes".to_string()).build(),
/// Token::builder(TokenID::Single(3), "coffee".to_string()).build(),
/// ]
/// })));
/// ```
///
pub struct Doc<T: BufRead> {
reader: T,
line_num: usize,
Expand Down Expand Up @@ -315,7 +347,7 @@ impl<T: BufRead> Iterator for Doc<T> {
mod test {
use std::collections::HashMap;

use crate::{Token, TokenID, UPOS};
use crate::{Token, UPOS};

use super::*;

Expand All @@ -330,11 +362,8 @@ mod test {
}

#[test]
fn can_parse_id_subordinate() {
assert_eq!(
parse_id("5.6"),
Ok(TokenID::Subordinate { major: 5, minor: 6 })
);
fn can_parse_id_empty() {
assert_eq!(parse_id("5.6"), Ok(TokenID::Empty(5, 6)));
}

#[test]
Expand All @@ -358,7 +387,7 @@ mod test {
features: Some(features),
head: Some(TokenID::Single(3)),
deprel: Some("det".to_string()),
dep: None,
deps: None,
misc: None,
};

Expand Down
175 changes: 175 additions & 0 deletions src/token.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
use std::collections::HashMap;

use crate::UPOS;

/// The ID of a token within a sentence, covering the three ID shapes
/// the CoNLL-U format allows in its first column.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenID {
    /// The standard, single index.
    Single(usize),
    /// A range of tokens that form an ID. Denoted by a hyphen
    /// in CoNLL-U format (e.g. 1-3).
    Range(usize, usize),
    /// To represent ellipses, CoNLL-U allows creating sub-indices of the preceding
    /// regular node (or 0 if it is at the beginning of a sentence). They are separated
    /// by a decimal point and represent an "empty" node.
    Empty(usize, usize),
}

type Features = HashMap<String, String>;

/// A `Token` is the basic unit of what is defined on a (non-comment) line in CoNLL-U format.
/// The CoNLL-U specification uses the terms _word_, _node_ and _multi-word token_ while this crate
/// decided to use the general notion of _Token_ to subsume all of the above.
///
/// The fields of a `Token` are the ten fields that are defined in the CoNLL-U specification.
/// The only mandatory fields are [id](Token::id) and [form](Token::form). The remaining ones are optional (absence denoted
/// by an underscore in the text format) and represented as [Option] types.
///
/// A [TokenBuilder] type is available for more convenient creation of [Token] structs,
/// which can be instantiated via the [builder](Token::builder) method.
///
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// The id of the token within the sentence.
    pub id: TokenID,
    /// The surface form of the token as it appears in the sentence.
    pub form: String,
    /// The lemma or lexical form of the token.
    pub lemma: Option<String>,
    /// The universal POS tag of the token.
    pub upos: Option<UPOS>,
    /// Language-specific POS tag for the token.
    pub xpos: Option<String>,
    /// Morphological features of the token as key-value pairs.
    pub features: Option<Features>,
    /// The head of the current token.
    pub head: Option<TokenID>,
    /// The dependency relation of the token.
    pub deprel: Option<String>,
    /// Enhanced dependency graph information.
    pub deps: Option<Vec<Dep>>,
    /// Other types of annotation.
    pub misc: Option<String>,
}

impl Token {
    /// Return a new [TokenBuilder] seeded with the two mandatory fields,
    /// `id` and `form`.
    ///
    /// `form` accepts anything convertible into a `String` (e.g. `&str`),
    /// which is backward compatible with passing an owned `String`.
    pub fn builder(id: TokenID, form: impl Into<String>) -> TokenBuilder {
        TokenBuilder::new(id, form.into())
    }
}

/// A builder for Tokens to allow for more convenient manual creation if necessary.
///
/// All optional fields start out as [None] and can be set with the
/// corresponding chainable setter methods; calling `build` produces the
/// final [Token].
///
/// ```rust
/// use rs_conllu::{Token, TokenID};
///
/// // Get a new builder from Token
/// let token = Token::builder(TokenID::Single(1), "Hello".to_string())
///     .lemma("Hello".to_string())
///     .build();
///
/// ```
pub struct TokenBuilder {
    // Mandatory fields, set in the constructor.
    id: TokenID,
    form: String,
    // The remaining fields mirror the optional fields of [Token].
    lemma: Option<String>,
    upos: Option<UPOS>,
    xpos: Option<String>,
    features: Option<Features>,
    head: Option<TokenID>,
    deprel: Option<String>,
    deps: Option<Vec<Dep>>,
    misc: Option<String>,
}

impl TokenBuilder {
    /// Constructor for [TokenBuilder]. Both `id` and `form` are mandatory
    /// fields and thus required when instantiating; every optional field
    /// starts out as [None].
    pub fn new(id: TokenID, form: String) -> TokenBuilder {
        TokenBuilder {
            id,
            form,
            lemma: None,
            upos: None,
            xpos: None,
            features: None,
            head: None,
            deprel: None,
            deps: None,
            misc: None,
        }
    }

    /// Set the lemma field.
    #[must_use]
    pub fn lemma(mut self, lemma: String) -> TokenBuilder {
        self.lemma = Some(lemma);
        self
    }

    /// Set the universal POS tag field.
    #[must_use]
    pub fn upos(mut self, upos: UPOS) -> TokenBuilder {
        self.upos = Some(upos);
        self
    }

    /// Set the xpos (language-specific POS tag) field.
    #[must_use]
    pub fn xpos(mut self, xpos: String) -> TokenBuilder {
        self.xpos = Some(xpos);
        self
    }

    /// Set the features field.
    #[must_use]
    pub fn features(mut self, features: Features) -> TokenBuilder {
        self.features = Some(features);
        self
    }

    /// Set the head field.
    #[must_use]
    pub fn head(mut self, head: TokenID) -> TokenBuilder {
        self.head = Some(head);
        self
    }

    /// Set the deprel (dependency relation) field.
    #[must_use]
    pub fn deprel(mut self, deprel: String) -> TokenBuilder {
        self.deprel = Some(deprel);
        self
    }

    /// Set the deps (enhanced dependency graph) field.
    // Parameter renamed from `dep` to `deps` for consistency with the
    // field and method name; Rust callers are unaffected by parameter names.
    #[must_use]
    pub fn deps(mut self, deps: Vec<Dep>) -> TokenBuilder {
        self.deps = Some(deps);
        self
    }

    /// Set the misc field.
    #[must_use]
    pub fn misc(mut self, misc: String) -> TokenBuilder {
        self.misc = Some(misc);
        self
    }

    /// Consume the builder and produce the final [Token].
    pub fn build(self) -> Token {
        Token {
            id: self.id,
            form: self.form,
            lemma: self.lemma,
            upos: self.upos,
            xpos: self.xpos,
            features: self.features,
            head: self.head,
            deprel: self.deprel,
            deps: self.deps,
            misc: self.misc,
        }
    }
}

/// A single relation in a token's enhanced dependency graph
/// (stored in [Token::deps]), pairing a head with a relation label.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Dep {
    /// The head of the relation.
    pub head: TokenID,
    /// The type of the relation.
    pub rel: String,
}
Loading

0 comments on commit 0483a8a

Please sign in to comment.