From 4770992ba7f8445460f4c8b98c8c75a15e9f8060 Mon Sep 17 00:00:00 2001 From: acmepjz Date: Mon, 8 Jul 2024 02:36:50 +0800 Subject: [PATCH] bump BibtexQuery version and delete corresponding files --- DocGen4/Output/Bibtex/Name.lean | 194 ---------------------- DocGen4/Output/Bibtex/TexDiacritics.lean | 200 ----------------------- lake-manifest.json | 2 +- 3 files changed, 1 insertion(+), 395 deletions(-) delete mode 100644 DocGen4/Output/Bibtex/Name.lean delete mode 100644 DocGen4/Output/Bibtex/TexDiacritics.lean diff --git a/DocGen4/Output/Bibtex/Name.lean b/DocGen4/Output/Bibtex/Name.lean deleted file mode 100644 index 928463d7..00000000 --- a/DocGen4/Output/Bibtex/Name.lean +++ /dev/null @@ -1,194 +0,0 @@ -import DocGen4.Output.Bibtex.TexDiacritics -import UnicodeBasic - -/-! - -This file contains functions for bibtex name processing. -Unless stated otherwise, the input string should have no TeX commands -and math equations. Braces are allowed. -An exception is `processNames` which allows input string contains TeX diacritics commands. -The math equations are still not allowed. - --/ - -open Lean Parsec Unicode - -namespace DocGen4.Bibtex - -partial def getNameBracedAux : Parsec String := do - let doOne : Parsec (Option String) := fun it => - if it.hasNext then - match it.curr with - | '{' => (.some <$> bracedContent getNameBracedAux) it - | '}' => .success it .none - | _ => (.some <$> normalChars) it - else - .success it .none - return String.join (← manyOptions doOne).toList - -/-- Input a bibtex name string without TeX commands -and math equations, split the string by " " and ",". -/ -def getNameAux : Parsec (Array String) := do - let normalChars' : Parsec String := many1Chars <| satisfy fun c => - match c with - | '\\' | '$' | '{' | '}' | ' ' | '\t' | '\r' | '\n' | ',' => false - | _ => true - let doOne' : Parsec (Option String) := fun it => - if it.hasNext then - match it.curr with - | '{' => (.some <$> bracedContent getNameBracedAux) it - | '}' | ' ' | '\t' | '\r' | '\n' | ',' => .success it .none - | _ => (.some <$> normalChars') it - else - .success it .none - let doOne : Parsec (Option String) := fun it => - if it.hasNext then - match it.curr with - | '}' => .success it .none - | ' ' | '\t' | '\r' | '\n' => (.some <$> ws') it - | ',' => .success it.next "," - | _ => ((.some <| String.join ·.toList) <$> manyOptions doOne') it - else - .success it .none - let arr ← manyOptions doOne - return arr.filterMap fun s => - let t := s.trim - if t.isEmpty then .none else .some t - -/-- Input a name string already split by spaces and comma, -return `(Firstname, Lastname)`. -The braces in the name are preserevd. Supported formats: - -- `Lastname, Firstname => (Firstname, Lastname)` -- `Firstname {Last {N}ame} => (Firstname, Last {N}ame)` -- `Firstname Lastname => (Firstname, Lastname)` -- `First Name Lastname => (First Name, Lastname)` -- `Firstname last Name => (Firstname, last Name)` --/ -def getName (arr : Array String) : String × String := - if arr.isEmpty then - ("", "") - else - let join' (arr : Subarray String) : String := arr.foldl (fun acc s => - acc ++ (if acc.isEmpty || s.isEmpty || s == "," then "" else " ") ++ s) "" - match arr.getIdx? "," with - | .some n => - (arr.toSubarray.drop (n + 1) |> join', arr.toSubarray.take n |> join') - | .none => - let s := arr.get! (arr.size - 1) - if s.startsWith "{" && s.endsWith "}" then - (arr.toSubarray.take (arr.size - 1) |> join', - s.toSubstring.drop 1 |>.dropRight 1 |>.toString) - else - let lastName := arr.reverse.toSubarray.drop 1 |>.toArray.takeWhile - (fun s => GeneralCategory.isLowercaseLetter s.front) |>.reverse.push s - (arr.toSubarray.take (arr.size - lastName.size) |> join', - lastName.toSubarray |> join') - -/-- Input a bibtex name string without TeX commands -and math equations, return an array of `(Firstname, Lastname)`. -The braces in the name are preserevd. -/ -def getNames : Parsec (Array (String × String)) := do - let arr ← getNameAux - let arr2 : Array (Array String) := arr.foldl (fun acc s => - if s = "and" then - acc.push #[] - else - acc.modify (acc.size - 1) (Array.push · s)) #[#[]] - return arr2.filterMap fun arr => - let ret := getName arr - if ret.1.isEmpty && ret.2.isEmpty then .none else .some ret - -/-- Strip diacritics from a character. -/ -def stripDiacritics (c : Char) : Char := - match c with - | 'œ' => 'o' | 'Œ' => 'O' - | 'æ' => 'a' | 'Æ' => 'A' - | 'å' => 'a' | 'Å' => 'A' - | 'ø' => 'o' | 'Ø' => 'O' - | 'ł' => 'l' | 'Ł' => 'L' - | 'ı' => 'i' | 'ȷ' => 'J' - | '\u00DF' => 's' | '\u1E9E' => 'S' - | _ => - let s := getCanonicalDecomposition c - s.get? (s.find GeneralCategory.isLetter) |>.getD c - -/-- Check if a string is an upper case Roman numerals. -It does not check the validity of the number, for example, it accepts `IXIX`. -/ -def isUppercaseRomanNumerals (s : String) : Bool := - not s.isEmpty && s.all fun c => - match c with - | 'I' | 'V' | 'X' | 'L' | 'C' | 'D' | 'M' => true - | _ => false - -/-- Input a last name string without TeX commands, braces -and math equations, already split by spaces and comma, -return `(oneLetterAbbr, threeLetterAbbr)` of the last name. -/ -def getLastNameAbbr (arr : Array String) : String × String := - let arr := if arr.size ≤ 1 then arr else arr.filter (not <| isUppercaseRomanNumerals ·) - match arr with - | #[] => ("", "") - | #[s] => - let s := s.toList.toArray.map stripDiacritics |>.filter GeneralCategory.isLetter - let arr : Array Nat := s.zipWithIndex.filterMap fun x => - if GeneralCategory.isUppercaseLetter x.1 then .some x.2 else .none - if arr.size ≥ 2 then - if arr[0]! + 2 = arr[1]! then - let s := s.toSubarray.drop arr[0]! |>.take 3 |>.toArray.toList |> String.mk - (s, s) - else - let s := arr.map s.get! |>.toList |> String.mk - (s, s) - else - let s := String.mk s.toList - (s.take 1, s.take 3) - | _ => - let s := arr.filterMap (·.get? 0) |>.toList |> String.mk - (s, s) - -/-- Represents the name of a person in bibtex author field. -/ -structure BibtexName where - firstName : String - lastName : String - oneLetterAbbr : String - threeLetterAbbr : String - deriving Repr - -/-- Process the first name and last name without TeX commands -and math equations, remove all braces in them, and produce -one-letter and three-letter abbreviations from the last name. -/ -def processName (s : String × String) : Except String BibtexName := - let removeBraces' (s : String) : Except String String := - match removeBraces s.iter with - | .success _ s => .ok s - | .error it err => .error s!"failed to run removeBraces on '{it.1}' at pos {it.2}: {err}" - match removeBraces' s.1 with - | .ok firstName => - match removeBraces' s.2 with - | .ok lastName => - match getNameAux s.2.iter with - | .success _ arr => - match arr.mapM removeBraces' with - | .ok arr => - let abbr := getLastNameAbbr <| arr.filter (not ·.trim.isEmpty) - .ok { - firstName := firstName - lastName := lastName - oneLetterAbbr := abbr.1 - threeLetterAbbr := abbr.2 - } - | .error err => .error err - | .error it err => .error s!"failed to run getNameAux on '{it.1}' at pos {it.2}: {err}" - | .error err => .error err - | .error err => .error err - -/-- Input a bibtex name string without math equations, return an array of `BibtexName`. -/ -def processNames (s : String) : Except String (Array BibtexName) := - match texDiacritics s.iter with - | .success _ s => - match getNames s.iter with - | .success _ arr => arr.mapM processName - | .error it err => .error s!"failed to run getNames on '{it.1}' at pos {it.2}: {err}" - | .error it err => .error s!"failed to run texDiacritics on '{it.1}' at pos {it.2}: {err}" - -end DocGen4.Bibtex diff --git a/DocGen4/Output/Bibtex/TexDiacritics.lean b/DocGen4/Output/Bibtex/TexDiacritics.lean deleted file mode 100644 index 3824362f..00000000 --- a/DocGen4/Output/Bibtex/TexDiacritics.lean +++ /dev/null @@ -1,200 +0,0 @@ -import Lean.Data.Parsec - -/-! - -This file contains functions for TeX diacritics process. -The main function is `texDiacritics`, which -will convert all TeX commands for diacritics into UTF-8 characters, -and error on any other TeX commands which are not in math environment. - --/ - -open Lean Parsec - -namespace DocGen4.Bibtex - -/-- Match a sequence of space characters and return it. -/ -def ws' : Parsec String := manyChars <| satisfy fun c => - c == ' ' || c == '\n' || c == '\r' || c == '\t' - -/-- Match a normal characters which is not the beginning of TeX command. -/ -def normalChar : Parsec Char := satisfy fun c => - c != '\\' && c != '$' && c != '{' && c != '}' - -/-- Match at least one normal characters which is not the beginning of TeX command. -/ -def normalChars : Parsec String := many1Chars normalChar - -/-- Match a TeX command starting with `\`, potentially with trailing whitespaces. -/ -def texCommand : Parsec String := pchar '\\' *> attempt do - let s ← manyChars asciiLetter - if s.isEmpty then - return "\\" ++ toString (← anyChar) ++ (← ws') - else - match ← peek? with - | .some c => - match c with - | '*' => - skip - return "\\" ++ s ++ toString c ++ (← ws') - | _ => - return "\\" ++ s ++ (← ws') - | .none => - return "\\" ++ s - -/-- Similar to `texCommand` but it excludes some commands. -/ -def texCommand' (exclude : Array String) : Parsec String := attempt do - let s ← texCommand - match exclude.find? (· == s.trim) with - | .some _ => fail s!"'{s.trim}' is not allowed" - | .none => return s - -/-- Match a sequence starting with `{` and ending with `}`. -/ -def bracedContent (p : Parsec String) : Parsec String := - pchar '{' *> (("{" ++ · ++ "}") <$> p) <* pchar '}' - -/-- Similar to `bracedContent` but it does not output braces. -/ -def bracedContent' (p : Parsec String) : Parsec String := - pchar '{' *> p <* pchar '}' - -partial def manyOptions {α} (p : Parsec (Option α)) (acc : Array α := #[]) : - Parsec (Array α) := fun it => - match p it with - | .success it ret => - match ret with - | .some ret => manyOptions p (acc.push ret) it - | .none => .success it acc - | .error it err => .error it err - -partial def mathContentAux2 : Parsec String := do - let doOne : Parsec (Option String) := fun it => - if it.hasNext then - match it.curr with - | '{' => (.some <$> bracedContent mathContentAux2) it - | '\\' => - match texCommand' #["\\(", "\\)", "\\[", "\\]"] it with - | .success it ret => .success it (.some ret) - | .error _ _ => .success it .none - | '}' | '$' => .success it .none - | _ => (.some <$> normalChars) it - else - .success it .none - return String.join (← manyOptions doOne).toList - -def mathContentAux (beginning ending dollar : String) : Parsec String := - pstring beginning *> ((dollar ++ · ++ dollar) <$> mathContentAux2) <* pstring ending - -/-- Match a math content. Returns `Option.none` if it does not start with `\(`, `\[` or `$`. -/ -def mathContent : Parsec (Option String) := fun it => - let substr := it.extract (it.forward 2) - if substr = "\\[" then - (.some <$> mathContentAux "\\[" "\\]" "$$") it - else if substr = "\\(" then - (.some <$> mathContentAux "\\(" "\\)" "$") it - else if substr = "$$" then - (.some <$> mathContentAux "$$" "$$" "$$") it - else if it.curr = '$' then - (.some <$> mathContentAux "$" "$" "$") it - else - .success it .none - -/-- Match a TeX command for diacritics, return the corresponding UTF-8 string. -Sometimes it needs to read the character after the command, -in this case the `p` is used to read braced content. -/ -def texDiacriticsCommand (p : Parsec String) : Parsec String := do - let cmd ← String.trim <$> texCommand - match cmd with - | "\\oe" => pure "œ" - | "\\OE" => pure "Œ" - | "\\ae" => pure "æ" - | "\\AE" => pure "Æ" - | "\\aa" => pure "å" - | "\\AA" => pure "Å" - | "\\o" => pure "ø" - | "\\O" => pure "Ø" - | "\\l" => pure "ł" - | "\\L" => pure "Ł" - | "\\i" => pure "ı" - | "\\j" => pure "ȷ" - | "\\ss" => pure "\u00DF" - | "\\SS" => pure "\u1E9E" - | "\\cprime" => pure "\u02B9" - | _ => - let ch : String := match cmd with - | "\\`" => "\u0300" - | "\\'" => "\u0301" - | "\\^" => "\u0302" - | "\\\"" => "\u0308" - | "\\~" => "\u0303" - | "\\=" => "\u0304" - | "\\." => "\u0307" - | "\\u" => "\u0306" - | "\\v" => "\u030C" - | "\\H" => "\u030B" - | "\\t" => "\u0361" - | "\\c" => "\u0327" - | "\\d" => "\u0323" - | "\\b" => "\u0331" - | "\\k" => "\u0328" - | _ => "" - if ch.isEmpty then - fail s!"unsupported command: '{cmd}'" - else - let doOne : Parsec String := fun it => - if it.hasNext then - match it.curr with - | '{' => bracedContent p it - | _ => normalChars it - else - .error it "character expected" - let s ← doOne - if s.startsWith "{" then - if s.length < 3 then - fail s!"expected string of length at least 3, but got '{s}'" - else - return s.take 2 ++ ch ++ s.drop 2 - else - if s.isEmpty then - fail "expected a non-empty string" - else - return s.take 1 ++ ch ++ s.drop 1 - -/-- Convert all TeX commands for diacritics into UTF-8 characters, -and error on any other TeX commands which are not in math environment. -/ -partial def texDiacritics : Parsec String := do - let doOne : Parsec (Option String) := fun it => - if it.hasNext then - match mathContent it with - | .success it ret => - match ret with - | .some ret => .success it (.some ret) - | .none => - match it.curr with - | '{' => (.some <$> bracedContent texDiacritics) it - | '\\' => (.some <$> texDiacriticsCommand texDiacritics) it - | '}' => .success it .none - | _ => (.some <$> normalChars) it - | .error it err => .error it err - else - .success it .none - return String.join (← manyOptions doOne).toList - -/-- Remove all braces except for those in math environment, -and error on any TeX commands which are not in math environment. -/ -partial def removeBraces : Parsec String := do - let doOne : Parsec (Option String) := fun it => - if it.hasNext then - match mathContent it with - | .success it ret => - match ret with - | .some ret => .success it (.some ret) - | .none => - match it.curr with - | '{' => (.some <$> bracedContent' removeBraces) it - | '}' => .success it .none - | _ => (.some <$> normalChars) it - | .error it err => .error it err - else - .success it .none - return String.join (← manyOptions doOne).toList - -end DocGen4.Bibtex diff --git a/lake-manifest.json b/lake-manifest.json index 323362fa..659d6fc3 100644 --- a/lake-manifest.json +++ b/lake-manifest.json @@ -15,7 +15,7 @@ "type": "git", "subDir": null, "scope": "", - "rev": "d4e2c1692c3bf1898a71a278f3441b09d8414225", + "rev": "d1229add3b208d1ac3051cd090a209ba19817d04", "name": "BibtexQuery", "manifestFile": "lake-manifest.json", "inputRev": "master",