From 67ad0533a75de22044846aba6451b05e08085131 Mon Sep 17 00:00:00 2001 From: AlexanderKaraberov Date: Sun, 18 Apr 2021 17:51:36 +0200 Subject: [PATCH] Initial implementation --- .gitignore | 23 +++++++++++++ CHANGELOG.md | 5 +++ Data/Time/UTCTimes.hs | 34 ++++++++++++++++++++ LICENSE | 21 ++++++++++++ Main.hs | 24 ++++++++++++++ README.md | 13 ++++++++ Setup.hs | 2 ++ System/FileSearch.hs | 75 +++++++++++++++++++++++++++++++++++++++++++ hlogsearch.cabal | 21 ++++++++++++ 9 files changed, 218 insertions(+) create mode 100644 .gitignore create mode 100644 CHANGELOG.md create mode 100644 Data/Time/UTCTimes.hs create mode 100644 LICENSE create mode 100644 Main.hs create mode 100644 README.md create mode 100644 Setup.hs create mode 100644 System/FileSearch.hs create mode 100644 hlogsearch.cabal diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4c9e245 --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +dist +dist-* +cabal-dev +*.o +*.hi +*.hie +*.chi +*.chs.h +*.dyn_o +*.dyn_hi +.hpc +.hsenv +.cabal-sandbox/ +cabal.sandbox.config +*.prof +*.aux +*.hp +*.eventlog +.stack-work/ +cabal.project.local +cabal.project.local~ +.HTF/ +.ghc.environment.* diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..97c8c52 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,5 @@ +# Revision history for hlogsearch + +## 0.1.0.0 -- 2021-04-18 + +* First version. 
Implemented basic search for a log entry's timestamp using an offset in the file as an index diff --git a/Data/Time/UTCTimes.hs b/Data/Time/UTCTimes.hs new file mode 100644 index 0000000..e0d9fdd --- /dev/null +++ b/Data/Time/UTCTimes.hs @@ -0,0 +1,34 @@ +module Data.Time.UTCTimes ( + compareLogTimes, + mkUTCTime, + zeroUTCJulianDay +) where + +import Data.Time.Clock (UTCTime (UTCTime), diffUTCTime, NominalDiffTime) +import Data.Time.ISO8601 ( parseISO8601 ) +import Data.Fixed (Pico) +import Data.Time.Calendar (fromGregorian) +import Data.Time.LocalTime (LocalTime, ZonedTime, timeOfDayToTime, TimeOfDay(TimeOfDay)) + +-- Compare two log timestamps; any difference of at most epsilon (bounds included) counts as EQ, so the result is total +compareLogTimes :: UTCTime -> UTCTime -> NominalDiffTime -> Ordering +compareLogTimes lhs rhs epsilon + | diff > epsilon = GT + | diff < (-epsilon) = LT + | otherwise = EQ + where diff = diffUTCTime lhs rhs + +-- Construct UTC time +mkUTCTime :: (Integer, Int, Int) + -> (Int, Int, Pico) + -> UTCTime +mkUTCTime (year, mon, day) (hour, min, sec) = + UTCTime (fromGregorian year mon day) + (timeOfDayToTime (TimeOfDay hour min sec)) + +-- Constructs a zero Julian day as UTCTime type. +-- You may use this if you need a more or less reasonable default when working with Maybe UTCTime. +-- It uses a VAX/VMS base time (November 17 1858) as a zero day. 
+-- http://shanebow.com/page/show/julian-day +zeroUTCJulianDay :: UTCTime +zeroUTCJulianDay = mkUTCTime (1858, 11, 17) (0, 0, 0.000000) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ff1eef1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Oleksandr Karaberov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/Main.hs b/Main.hs new file mode 100644 index 0000000..8acd78b --- /dev/null +++ b/Main.hs @@ -0,0 +1,24 @@ +module Main where + +import System.FileSearch (searchLog, readLogLine) +import Data.Time.UTCTimes (mkUTCTime) +import Data.Maybe (fromMaybe) +import Data.Time.Clock (UTCTime (UTCTime)) + +main :: IO () +main = do + -- Let's assume we need to examine a very narrow window in an old and very large log file (50GB in size). + + -- set search parameters (N.B. naturally these should be passed as CLI args). 
+ -- log date we are looking for, or something close to it if this exact timestamp does not exist. + let searchTerm = mkUTCTime (2019, 12, 12) (10, 34, 28.909266) :: UTCTime + -- log file to use for search + let path = "/var/log/cassandra/gc.log" + -- now perform a search. + -- (if you need a resolution finer than 1 millisecond, consider using searchLog'). + result <- searchLog path searchTerm + -- print the found log entry, or report that nothing matched instead of showing an unrelated line + case result of + Nothing -> putStrLn $ "No entry found for time: " ++ show searchTerm + Just (found, offset) -> readLogLine path offset >>= \entry -> + putStrLn $ "Entry with time: " ++ show found ++ " at offset: " ++ show offset ++ " is: \n" ++ entry diff --git a/README.md b/README.md new file mode 100644 index 0000000..898c4ca --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +# hlogsearch + +A minimalistic tool written in Haskell to search log entries in arbitrarily large and noisy log files by a very precise log datetime (up to 1 microsecond). It can be used either as a standalone tool, or as a helper library to build sophisticated log analysis tools, or even as a mere first step in your larger UNIX pipeline to help analyze logs such as: + +```sh +hlogsearch "2021-03-14T14:10:03.000000" gc.log | grep -A 100 -B 100 "PATTERN" | awk '{print $9}' | sort | uniq -c +``` + +## Usage + +`cabal run` is all you need. + +Refer to `Main.hs` for an example on how to use this tool. `hlogsearch` expects unstructured log file formats with dates printed in [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) format. 
diff --git a/Setup.hs b/Setup.hs new file mode 100644 index 0000000..9a994af --- /dev/null +++ b/Setup.hs @@ -0,0 +1,2 @@ +import Distribution.Simple +main = defaultMain diff --git a/System/FileSearch.hs b/System/FileSearch.hs new file mode 100644 index 0000000..c1a7de3 --- /dev/null +++ b/System/FileSearch.hs @@ -0,0 +1,75 @@ +{-# LANGUAGE MultiWayIf #-} + +module System.FileSearch ( + searchLog, + searchLog', + searchLogUntil, + readLogLine +) where + +import System.IO + ( hSeek, + openFile, + withFile, + hGetLine, + SeekMode(AbsoluteSeek), + IOMode(ReadMode) ) +import Data.Maybe (fromMaybe) +import Data.Time.Clock (UTCTime (UTCTime), diffUTCTime, NominalDiffTime) +import Data.Time.UTCTimes (compareLogTimes, mkUTCTime, zeroUTCJulianDay) +import System.PosixCompat (getFileStatus) +import System.PosixCompat.Files (fileSize) +import Data.Time.ISO8601 ( parseISO8601 ) + +-- Search the whole log file (up to EOF) for a timestamp, using 1 millisecond precision. +searchLog :: String -> UTCTime -> IO (Maybe (UTCTime, Integer)) +searchLog file key = + -- assume one millisecond log precision as a sensible default. + searchLog' file key 0.001 + +-- Same as searchLog but allows specifying a custom log time precision. +searchLog' :: String -> UTCTime -> NominalDiffTime -> IO (Maybe (UTCTime, Integer)) +searchLog' file key precision = do + size <- getFileSize file + searchLogUntil file key size precision + +-- A more customizable variant of searchLog with additional params, notably: +-- high : defines an arbitrary offset to use as a search boundary. +-- precision: defines a custom resolution used to compare log entries' timestamps. 
+searchLogUntil :: [Char] -> UTCTime -> Integer -> NominalDiffTime -> IO (Maybe (UTCTime, Integer)) +searchLogUntil file key high precision = + go 0 high where + go low upper | upper < low = return Nothing + go low upper = do + let mid = (upper + low) `div` 2 + pivot <- getLogTime file mid + if | compareLogTimes pivot key precision == GT -> go low (mid - 1) + | compareLogTimes pivot key precision == LT -> go (mid + 1) upper + | otherwise -> return $ Just (pivot, mid) + +-- Read the first complete line that starts after the given offset (the line containing the offset is skipped). +-- NOTE: hGetLine raises an EOF error when no further line follows the offset. +readLogLine :: FilePath -> Integer -> IO String +readLogLine p offset = + withFile p ReadMode $ \hdl -> do + hSeek hdl AbsoluteSeek offset + -- drop the (possibly partial) line the offset falls into; the handle is then positioned at the next line + _ <- hGetLine hdl + hGetLine hdl + +-- Internal auxiliary functions + +getLogTime :: FilePath -> Integer -> IO UTCTime +getLogTime = (utcTimeFromLog .) . readLogLine + +utcTimeFromLog :: IO String -> IO UTCTime +utcTimeFromLog log = do + logEntry <- log + return $ case words logEntry of + (_:time:_) -> fromMaybe zeroUTCJulianDay $ parseISO8601 time + _ -> zeroUTCJulianDay + +getFileSize :: String -> IO Integer +getFileSize path = do + stat <- getFileStatus path + return $ fromIntegral (fileSize stat) diff --git a/hlogsearch.cabal b/hlogsearch.cabal new file mode 100644 index 0000000..b29c0a3 --- /dev/null +++ b/hlogsearch.cabal @@ -0,0 +1,21 @@ +cabal-version: >=1.10 +name: hlogsearch +version: 0.1.0.0 +synopsis: Minimalistic tool to search for log entries in large files. +description: + Allows to find log lines and their position (offset) in large log files using a very + precise log entry datetime as a search key, with up to 1 microsecond precision in case of voluminous and noisy logs. 
+license: MIT +license-file: LICENSE +author: Oleksandr Karaberov +maintainer: alexander.karaberov@gmail.com +copyright: (c) 2021 Oleksandr Karaberov +category: System Tools +build-type: Simple +extra-source-files: CHANGELOG.md + +executable hlogsearch + main-is: Main.hs + other-modules: System.FileSearch, Data.Time.UTCTimes + build-depends: base >=4.14 && <4.15, time, unix-compat, iso8601-time + default-language: Haskell2010