From 85f76e6d35f0893393f12357f00f0881fc2bac73 Mon Sep 17 00:00:00 2001 From: Bayram Date: Sun, 10 Nov 2024 23:38:35 +0100 Subject: [PATCH] Add freqs --- .gitignore | 6 +- freqs/Cargo.lock | 456 ++++++++++++++++++++++++++++++++++++++++++++ freqs/Cargo.toml | 8 + freqs/README.md | 15 ++ freqs/src/config.rs | 24 +++ freqs/src/lib.rs | 1 + freqs/src/main.rs | 94 +++++++++ wcr/.DS_Store | Bin 6148 -> 0 bytes 8 files changed, 603 insertions(+), 1 deletion(-) create mode 100644 freqs/Cargo.lock create mode 100644 freqs/Cargo.toml create mode 100644 freqs/README.md create mode 100644 freqs/src/config.rs create mode 100644 freqs/src/lib.rs create mode 100644 freqs/src/main.rs delete mode 100644 wcr/.DS_Store diff --git a/.gitignore b/.gitignore index b60de5b..e8e63dc 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,5 @@ -**/target +/**/target +/**/.DS_Store +/**/*.idea +/**/*.txt +/**/*.csv \ No newline at end of file diff --git a/freqs/Cargo.lock b/freqs/Cargo.lock new file mode 100644 index 0000000..cde286e --- /dev/null +++ b/freqs/Cargo.lock @@ -0,0 +1,456 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +dependencies = [ + "anstyle", + "windows-sys 0.59.0", +] + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "backtrace" +version = "0.3.74" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets", +] + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + +[[package]] +name = "bytes" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.5.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_lex" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "freqs" +version = "0.1.0" +dependencies = [ + "clap", + "tokio", +] + +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "libc" +version = "0.2.162" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +dependencies = [ + "hermit-abi", + "libc", + "wasi", + "windows-sys 0.52.0", +] + +[[package]] +name = "object" +version = "0.36.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +dependencies = [ + "memchr", +] + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +dependencies = [ + "bitflags", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "signal-hook-registry" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" +dependencies = [ + "libc", +] + 
+[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tokio" +version = "1.41.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.52.0", +] + +[[package]] +name = "tokio-macros" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "wasi" +version = 
"0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = 
"0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/freqs/Cargo.toml b/freqs/Cargo.toml new file mode 100644 index 0000000..0a47909 --- /dev/null +++ b/freqs/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "freqs" +version = "0.1.0" +edition = "2021" + +[dependencies] +clap = "4.5.20" +tokio = { version = "1.41.1", features = ["full"] } diff --git a/freqs/README.md b/freqs/README.md new file mode 100644 index 0000000..783c79d --- /dev/null +++ b/freqs/README.md @@ -0,0 +1,15 @@ +A `channel` is a one-way conduit for sending values from one thread to another. +In other words, it’s a __thread-safe queue__. + +With channels, threads can communicate by passing values to one another. +It’s a very simple way for threads to work together without using locking or shared memory. + +Rust channels are faster than Unix pipes. Sending a value moves it rather than copying it, +and moves are fast even when you’re moving data structures that contain many megabytes of data. + +This project is an example of how to use `tokio` channels to create a pipeline of `tokio` tasks. 
+The pipeline works as follows:
+```
+[ Read file ] => [ Split into words ] => [ Split into letters and store their frequency in a HashMap ]
+```
+
diff --git a/freqs/src/config.rs b/freqs/src/config.rs
new file mode 100644
index 0000000..e43228b
--- /dev/null
+++ b/freqs/src/config.rs
@@ -0,0 +1,32 @@
+//! Command-line argument handling for the `freqs` binary.
+
+use clap::{Arg, Command};
+
+/// Parses the command line and returns the input file names in the order given.
+///
+/// clap prints a usage error and exits the process when no file name is
+/// supplied, because the `file` argument is declared as required.
+pub fn get_args() -> Vec<String> {
+    build_command()
+        .get_matches()
+        .get_many::<String>("file")
+        // Unreachable in practice: `file` is `required(true)`.
+        .expect("Missing file names.")
+        .cloned()
+        .collect()
+}
+
+/// Builds the clap command describing the CLI: one or more positional file names.
+fn build_command() -> Command {
+    Command::new("freqs")
+        .author("Bayram, bkulyev@gmail.com")
+        // Keep `--version` in sync with Cargo.toml instead of hard-coding it.
+        .version(env!("CARGO_PKG_VERSION"))
+        .about("Counts how often each alphanumeric character occurs in the given files.")
+        .arg(
+            Arg::new("file")
+                .required(true)
+                .num_args(1..)
+                .help("List of filenames separated by space."),
+        )
+}
\ No newline at end of file
diff --git a/freqs/src/lib.rs b/freqs/src/lib.rs
new file mode 100644
index 0000000..a105933
--- /dev/null
+++ b/freqs/src/lib.rs
@@ -0,0 +1 @@
+pub mod config;
\ No newline at end of file
diff --git a/freqs/src/main.rs b/freqs/src/main.rs
new file mode 100644
index 0000000..d72516f
--- /dev/null
+++ b/freqs/src/main.rs
@@ -0,0 +1,124 @@
+use freqs::config;
+use std::collections::HashMap;
+use std::io::{BufWriter, Write};
+use tokio::sync::mpsc::{self, Receiver, Sender};
+use tokio::task::JoinHandle;
+use tokio::{fs, io};
+
+/// Capacity of the channel carrying whole file contents.
+const FILES_BUFFER_SIZE: usize = 8;
+/// Capacity of the channel carrying individual words.
+const WORDS_BUFFER_SIZE: usize = 5000;
+
+/// Occurrence count per alphanumeric character.
+// NOTE(review): counter type chosen as `usize`; confirm this is the intended width.
+type Frequencies = HashMap<char, usize>;
+
+/// Reads each file fully into memory and sends its text down `sender`.
+///
+/// Stops early, without error, once the receiving side has been dropped.
+///
+/// # Errors
+///
+/// Returns the first I/O error hit, annotated with the offending file name.
+async fn read_files(filenames: Vec<String>, sender: Sender<String>) -> io::Result<()> {
+    for filename in filenames {
+        let text = fs::read_to_string(&filename)
+            .await
+            .map_err(|err| io::Error::new(err.kind(), format!("{filename}: {err}")))?;
+
+        if sender.send(text).await.is_err() {
+            break;
+        }
+    }
+
+    Ok(())
+}
+
+/// Splits every received text on ASCII whitespace and forwards each word.
+///
+/// Returns as soon as the receiving side of `sender` is gone; there is no
+/// point in draining further texts that nobody will consume.
+async fn split_into_words(sender: Sender<String>, mut texts_recv: Receiver<String>) {
+    while let Some(text) = texts_recv.recv().await {
+        for word in text.split_ascii_whitespace() {
+            if sender.send(word.to_owned()).await.is_err() {
+                return;
+            }
+        }
+    }
+}
+
+/// Counts how often each alphanumeric character occurs in the received words.
+async fn split_into_letters(mut words_recv: Receiver<String>) -> Frequencies {
+    let mut freqs = Frequencies::new();
+
+    while let Some(word) = words_recv.recv().await {
+        for letter in word.chars().filter(|c| c.is_alphanumeric()) {
+            *freqs.entry(letter).or_insert(0) += 1;
+        }
+    }
+
+    freqs
+}
+
+/// Spawns a task that reads the files into memory.
+///
+/// A read failure is reported on stderr; the texts sent before the failure
+/// still flow through the rest of the pipeline.
+fn start_file_reader_thread(filenames: Vec<String>) -> Receiver<String> {
+    let (sender, receiver) = mpsc::channel(FILES_BUFFER_SIZE);
+    tokio::spawn(async move {
+        // The task is detached, so without this the error would vanish silently.
+        if let Err(err) = read_files(filenames, sender).await {
+            eprintln!("freqs: {err}");
+        }
+    });
+
+    receiver
+}
+
+/// Spawns a task that receives texts and splits them into words.
+fn start_text_splitter_thread(texts_recv: Receiver<String>) -> Receiver<String> {
+    let (sender, receiver) = mpsc::channel(WORDS_BUFFER_SIZE);
+    tokio::spawn(split_into_words(sender, texts_recv));
+
+    receiver
+}
+
+/// Spawns a task that receives words and tallies their letters.
+fn start_word_splitter_thread(words_recv: Receiver<String>) -> JoinHandle<Frequencies> {
+    tokio::spawn(split_into_letters(words_recv))
+}
+
+/// Wires the three stages together and returns the handle of the final one.
+fn run_pipeline(filenames: Vec<String>) -> JoinHandle<Frequencies> {
+    let texts_recv = start_file_reader_thread(filenames);
+    let words_recv = start_text_splitter_thread(texts_recv);
+    start_word_splitter_thread(words_recv)
+}
+
+/// Writes the counts to `freqs.csv` as `character,count` rows sorted by character.
+fn write_to_file(freqs: Frequencies) -> Result<(), tokio::io::Error> {
+    // HashMap iteration order is arbitrary; sort so the output is reproducible.
+    let mut rows: Vec<(char, usize)> = freqs.into_iter().collect();
+    rows.sort_unstable();
+
+    let mut file = BufWriter::new(std::fs::File::create("freqs.csv")?);
+    for (letter, count) in rows {
+        writeln!(file, "{letter},{count}")?;
+    }
+
+    // Flush explicitly: errors raised while dropping a BufWriter are discarded.
+    file.flush()
+}
+
+#[tokio::main]
+async fn main() -> Result<(), tokio::io::Error> {
+    let filenames = config::get_args();
+
+    let freqs = run_pipeline(filenames).await?;
+    write_to_file(freqs)?;
+
+    Ok(())
+}
diff --git a/wcr/.DS_Store b/wcr/.DS_Store
deleted file mode 100644
index 5873551b20e159e39c741a62b52a32beec4288bc..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKF=_)r43uIA3~5}t+%Mz@i*a6%4}{q142GQ3UzK;|X`YcpusIIgqzNOCW_M27
z<)%2D%*?l6hi9|3nXTYN`(~ILpVKGyPz=#|#{T$zIvugJ$4T-G!PLv7PX|Jb0Kg^a zFsx&i05%f9UN|O#fq9k!v(#(E@GJ-4Dz6ugiCJzQH{+bT*=s`axE*|pbn~95C