diff --git a/Cargo.toml b/Cargo.toml index b418d56..9d29c62 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.1" edition = "2021" readme = "README.md" description = "Rust port of elastic Grok processor" -repository = "https://github.com/GreptimeTeam/promql-parser" +repository = "https://github.com/yuanbohan/grok-rs" authors = ["yuanbohan"] keywords = ["grok", "elastic", "logstash", "ETL"] license = "Apache-2.0" diff --git a/README.md b/README.md index f3c22f4..be7db2f 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,16 @@ [![Version](https://img.shields.io/crates/v/grok-rs?label=grok-rs)](https://crates.io/crates/grok-rs) [![codecov](https://codecov.io/gh/yuanbohan/grok-rs/graph/badge.svg?token=1T8WSFV6BX)](https://codecov.io/gh/yuanbohan/grok-rs) -# grok +# grok_rs -Rust port of Elastic Grok processor, inspired by [grok-go][grok-go] and [grok][grok] +`grok_rs` is a Rust port of the Elastic Grok processor, inspired by [grok-go][grok-go] and [grok][grok] + +## Usage + +```toml +[dependencies] +grok-rs = "0.1" +``` ## Example @@ -92,6 +99,10 @@ the output is: } ``` +## Notice + +`grok_rs` is based on the [regex][regex] crate, so it lacks several features that are not known how to implement efficiently. This includes, but is not limited to, look-around and backreferences. In exchange, all regex searches in this crate have worst case `O(m * n)` time complexity, where `m` is proportional to the size of the regex and `n` is proportional to the size of the string being searched. + ## Elastic Grok compliance This crate declares compatible with [elastic grok patterns v8.14.0][grok-patterns], which is tagged at 2024-06-05. 
@@ -99,3 +110,4 @@ This crate declares compatible with [elastic grok patterns v8.14.0][grok-pattern [grok-patterns]: https://github.com/elastic/elasticsearch/tree/v8.14.0/libs/grok/src/main/resources/patterns/ecs-v1 [grok-go]: https://github.com/elastic/go-grok [grok]: https://github.com/daschl/grok +[regex]: https://docs.rs/regex/latest/regex diff --git a/src/lib.rs b/src/lib.rs index 4a1550f..6f630ac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,40 @@ +//! grok-rs is a Rust implementation of the [Grok](https://www.elastic.co/guide/en/elasticsearch/reference/current/grok-processor.html) +//! pattern matching library. +//! +//! The captured group can be renamed based on the alias, and the value can be converted to the specified type. +//! The supported types are: +//! - int +//! - long +//! - float +//! - double +//! - bool +//! +//! # Usage +//! +//! Initialize a Grok instance with the default patterns, or add custom patterns, then compile your pattern, +//! and parse the input string based on the pattern. +//! +//! ``` +//! use std::collections::HashMap; +//! use grok_rs::{Grok, Value}; +//! +//! let mut grok = Grok::default(); +//! grok.add_pattern("NAME", r"[A-z0-9._-]+"); +//! let pattern = grok.compile("%{NAME}", false).unwrap(); +//! let expected: HashMap = [("NAME", "admin")] +//! .into_iter() +//! .map(|(k, v)| (k.to_string(), Value::String(v.to_string()))) +//! .collect(); +//! +//! assert_eq!(expected, pattern.parse("admin").unwrap()); +//! assert_eq!(expected, pattern.parse("admin user").unwrap()); +//! 
``` use std::{ collections::HashMap, fs::File, io::{BufRead, BufReader}, }; -use glob::glob; -use lazy_static::lazy_static; use regex::Regex; const MAX_RECURSION: i32 = 1024; @@ -31,7 +60,7 @@ const GROK_PATTERN: &str = r"(?x) fn load_patterns() -> HashMap { let mut patterns = HashMap::new(); - for line in glob("src/patterns/*") + for line in glob::glob("src/patterns/*") .unwrap() .map(|e| File::open(e.unwrap()).unwrap()) .flat_map(|f| BufReader::new(f).lines()) @@ -47,7 +76,7 @@ fn load_patterns() -> HashMap { patterns } -lazy_static! { +lazy_static::lazy_static! { static ref GROK_REGEX: Regex = Regex::new(GROK_PATTERN).unwrap(); static ref DEFAULT_PATTERNS: HashMap = load_patterns(); } @@ -60,17 +89,42 @@ pub enum Value { String(String), } +type AliasType = (String, Option); + #[derive(Debug)] pub struct Pattern { regex: Regex, - alias: HashMap)>, + alias: HashMap, } impl Pattern { - fn new(regex: Regex, alias: HashMap)>) -> Self { + fn new(regex: Regex, alias: HashMap) -> Self { Self { regex, alias } } + /// parse the input string based on the pattern, and rename the captured group based on alias. + /// - if type is specified, then the value will be converted to the specified type. + /// - if the type is not supported, then the value will be kept as string. + /// - if the value can't be converted to the specified type, then an error will be returned. + /// - if the value can't be captured, then an empty map will be returned. 
+ /// + /// # Example + /// ``` + /// use std::collections::HashMap; + /// use grok_rs::{Grok, Value}; + /// + /// let grok = Grok::default(); + /// let pattern = grok.compile("%{USERNAME}", false).unwrap(); + /// let result = pattern.parse("admin admin@example.com").unwrap(); + /// let expected = [("USERNAME", "admin")] + /// .into_iter() + /// .map(|(k, v)| (k.to_string(), Value::String(v.to_string()))) + /// .collect::>(); + /// assert_eq!(expected, result); + /// + /// let empty = pattern.parse("✅").unwrap(); + /// assert!(empty.is_empty()); + /// ``` pub fn parse(&self, s: &str) -> Result, String> { let mut map = HashMap::new(); let names = self.regex.capture_names().flatten().collect::>(); @@ -86,19 +140,16 @@ impl Pattern { match self.alias.get(name) { Some((alias, type_)) => { let value = match type_ { - Some(type_) => match type_.as_str() { - "int" | "long" => Value::Int( - value.parse::().map_err(|e| format!("{e}: {value}"))?, - ), - "float" | "double" => Value::Float( - value.parse::().map_err(|e| format!("{e}: {value}"))?, - ), - "bool" | "boolean" => Value::Bool( - value.parse::().map_err(|e| format!("{e}: {value}"))?, - ), - _ => Value::String(value), - }, - None => Value::String(value), + Some(type_) if type_.eq("int") || type_.eq("long") => Value::Int( + value.parse::().map_err(|e| format!("{e}: {value}"))?, + ), + Some(type_) if type_.eq("float") || type_.eq("double") => Value::Float( + value.parse::().map_err(|e| format!("{e}: {value}"))?, + ), + Some(type_) if type_.eq("bool") || type_.eq("boolean") => Value::Bool( + value.parse::().map_err(|e| format!("{e}: {value}"))?, + ), + _ => Value::String(value), }; map.insert(alias.clone(), value); } @@ -119,13 +170,33 @@ pub struct Grok { } impl Grok { + /// add a custom pattern, if the pattern is already defined, then it will be overwritten. 
+ /// # Example + /// ``` + /// use grok_rs::Grok; + /// + /// let mut grok = Grok::default(); + /// grok.add_pattern("NAME", r"[A-z0-9._-]+"); + /// ``` pub fn add_pattern>(&mut self, name: T, pattern: T) { self.patterns.insert(name.into(), pattern.into()); } - /// if named_capture_only is true, then pattern without alias won't be captured. e.g. - /// if pattern is "%{USERNAME} %{EMAILADDRESS:email}" and named_capture_only is true, - /// then only email will be captured. + /// Compile the pattern, and return a Pattern. + /// - if `named_capture_only` is true, then the unnamed capture group will be ignored. + /// - if the pattern is invalid or not found, then an error will be returned. + /// + /// Because the compile process is heavy, it's recommended to compile the pattern once and reuse it. + /// + /// # Example + /// + /// The USERNAME will be ignored because `named_capture_only` is true. + /// + /// ``` + /// use grok_rs::Grok; + /// let grok = Grok::default(); + /// let pattern = grok.compile("%{USERNAME} %{EMAILADDRESS:email}", true).unwrap(); + /// ``` pub fn compile(&self, s: &str, named_capture_only: bool) -> Result { let mut alias_map = HashMap::new(); let mut haystack = s.to_string();