Merge pull request #45 from cdesaintguilhem/pr-handle-sections

Split sectioning commands onto new lines
WGUNDERWOOD · Oct 31, 2024 · ffa7e38 · ffa7e38
2 parents 2a7b1d4 + 8453f71
commit ffa7e38
Show file tree

Hide file tree

Showing 6 changed files with 100 additions and 18 deletions.
diff --git a/src/format.rs b/src/format.rs
@@ -4,6 +4,7 @@ use crate::cli::*;
 use crate::ignore::*;
 use crate::indent::*;
 use crate::logging::*;
+use crate::regexes::RE_SECTIONING;
 use crate::regexes::{ENV_BEGIN, ENV_END, ITEM};
 use crate::subs::*;
 use crate::verbatim::*;
@@ -193,6 +194,8 @@ pub struct Pattern {
     pub contains_env_end: bool,
     /// Whether an item pattern is present
     pub contains_item: bool,
+    /// Whether a sectioning pattern is present
+    pub contains_sectioning: bool,
 }
 
 impl Pattern {
@@ -202,6 +205,7 @@ impl Pattern {
             contains_env_begin: s.contains(ENV_BEGIN),
             contains_env_end: s.contains(ENV_END),
             contains_item: s.contains(ITEM),
+            contains_sectioning: RE_SECTIONING.is_match(s),
         }
     }
 }

diff --git a/src/regexes.rs b/src/regexes.rs
@@ -30,8 +30,57 @@ const LISTS: [&str; 5] = [
 const VERBATIMS: [&str; 5] =
     ["verbatim", "Verbatim", "lstlisting", "minted", "comment"];
 
+/// Regex fragments (pattern strings, not compiled regexes) for the
+/// non-sectioning commands that should be placed on a new line.
+///
+/// Together with [`SECTIONING_COMMANDS`] they are collected into
+/// `SPLITTING_COMMANDS` and joined with `|` to build `SPLITTING_OR_GROUP`.
+// NOTE(review): `RE_ITEM_SHARED_LINE` matches `\item` without a trailing
+// space, whereas the fragment below requires one; confirm that a line such
+// as `text \item[x]` cannot reach the `unwrap()` on the combined regex.
+const REQUIRE_NEW_LINE: [&str; 3] = [
+    r"\\begin\{",
+    r"\\end\{",
+    r"\\item ", // The trailing space should remain here.
+];
+
+/// Regex fragments for sectioning commands: `\part`, `\chapter`, `\section`,
+/// `\subsection` and `\subsubsection`, each in plain and starred (`*`) form.
+///
+/// Every fragment ends with an escaped `{`, so it only matches a command that
+/// is immediately followed by its opening brace.
+const SECTIONING_COMMANDS: [&str; 10] = [
+    r"\\part\{",
+    r"\\part\*\{",
+    r"\\chapter\{",
+    r"\\chapter\*\{",
+    r"\\section\{",
+    r"\\section\*\{",
+    r"\\subsection\{",
+    r"\\subsection\*\{",
+    r"\\subsubsection\{",
+    r"\\subsubsection\*\{",
+];
+
 // Regexes
 lazy_static! {
+    // Regex source for a single parenthesised group that alternates over
+    // every entry of [`SECTIONING_COMMANDS`].
+    pub static ref SECTIONING_OR_GROUP: String =
+        format!("({})", SECTIONING_COMMANDS.join("|"));
+    // Every command that needs a new line: the entries of
+    // [`REQUIRE_NEW_LINE`] followed by those of [`SECTIONING_COMMANDS`].
+    pub static ref SPLITTING_COMMANDS: Vec<&'static str> = REQUIRE_NEW_LINE
+        .iter()
+        .chain(SECTIONING_COMMANDS.iter())
+        .copied()
+        .collect();
+    // Regex source for a single parenthesised group that alternates over
+    // every entry of [`SPLITTING_COMMANDS`].
+    pub static ref SPLITTING_OR_GROUP: String =
+        format!("({})", SPLITTING_COMMANDS.join("|"));
     pub static ref RE_NEWLINES: Regex =
         Regex::new(&format!(r"{LINE_END}{LINE_END}({LINE_END})+")).unwrap();
     pub static ref RE_TRAIL: Regex =
@@ -52,22 +101,25 @@ lazy_static! {
         Regex::new(r"(?P<prev>\S.*?)(?P<env>\\end\{)").unwrap();
     pub static ref RE_ITEM_SHARED_LINE: Regex =
         Regex::new(r"(?P<prev>\S.*?)(?P<env>\\item)").unwrap();
+    // Matches any one of the sectioning commands, using the alternation
+    // built in `SECTIONING_OR_GROUP`.
+    pub static ref RE_SECTIONING: Regex =
+        Regex::new(SECTIONING_OR_GROUP.as_str()).unwrap();
+    // Matches a sectioning command that follows other non-whitespace text.
+    // Group 1 is the preceding text (starting at a non-whitespace character);
+    // group 2 is the command together with whatever follows it.
+    pub static ref RE_SECTION_SHARED_LINE: Regex = Regex::new(&format!(
+        r"(\S.*?)({}.*)",
+        SECTIONING_OR_GROUP.as_str()
+    ))
+    .unwrap();
     // Regex that matches any splitting command with non-whitespace
-    // characters before it and catches the previous text in a group called
+    // characters before it, catches the previous text in a group called
     // "prev" and captures the command itself and the remaining text
     // in a group called "env".
-    pub static ref RE_ENV_ITEM_SHARED_LINE: Regex = Regex::new(
-        r"(?x)          # Enable extended mode
-        (?P<prev>\S.*?) # <prev>: captures any number of characters starting
-                        # with a non-whitespace character until the start
-                        # of the next group;
-        (?P<env>(       # <env>: captures any LaTeX command before which the
-                        # line should be split
-            \\begin\{   # start of environments
-            |\\end\{    # end of environments
-            |\\item )   # list items (note the space before the closing bracket)
-        .*)             # and any characters that follow the command
-        "
+    pub static ref RE_ENV_ITEM_SEC_SHARED_LINE: Regex = Regex::new(
+        // <prev>: text before the command; <env>: the command (any entry of
+        // the splitting alternation) plus the characters that follow it.
+        &format!(
+            r"(?P<prev>\S.*?)(?P<env>{}.*)",
+            SPLITTING_OR_GROUP.as_str()
+        )
     )
     .unwrap();
 }
diff --git a/src/subs.rs b/src/subs.rs
@@ -30,18 +30,20 @@ pub fn needs_env_new_line(line: &str, pattern: &Pattern) -> bool {
     // Check if we should format this line and if we've matched an environment.
     let contains_splittable_env = (pattern.contains_env_begin
         || pattern.contains_env_end
-        || pattern.contains_item)
+        || pattern.contains_item
+        || pattern.contains_sectioning)
         && (RE_ENV_BEGIN_SHARED_LINE.is_match(line)
             || RE_ENV_END_SHARED_LINE.is_match(line)
-            || RE_ITEM_SHARED_LINE.is_match(line));
+            || RE_ITEM_SHARED_LINE.is_match(line)
+            || RE_SECTION_SHARED_LINE.is_match(line));
 
     // If we're not ignoring and we've matched an environment ...
     if contains_splittable_env {
         // ... return `true` if the comment index is `None`
         // (which implies the split point must be in text), otherwise
         // compare the index of the comment with the split point.
         find_comment_index(line).map_or(true, |comment_index| {
-            if RE_ENV_ITEM_SHARED_LINE
+            if RE_ENV_ITEM_SEC_SHARED_LINE
                 .captures(line)
                 .unwrap() // Matched split point so no panic.
                 .get(2)
@@ -74,7 +76,7 @@ pub fn put_env_new_line<'a>(
     args: &Cli,
     logs: &mut Vec<Log>,
 ) -> (&'a str, &'a str) {
-    let captures = RE_ENV_ITEM_SHARED_LINE.captures(line).unwrap();
+    let captures = RE_ENV_ITEM_SEC_SHARED_LINE.captures(line).unwrap();
 
     let (line, [prev, rest, _]) = captures.extract();
 

diff --git a/src/tests.rs b/src/tests.rs
@@ -89,7 +89,7 @@ fn test_short() {
         //"comments.tex",
         //"cv.tex",
         //"document.tex",
-        "environment_lines.tex",
+        // "environment_lines.tex",
         //"heavy_wrap.tex",
         //"higher_categories_thesis.bib",
         //"higher_categories_thesis.tex",
@@ -102,6 +102,7 @@ fn test_short() {
         //"puthesis.cls",
         //"quiver.sty",
         //"readme.tex",
+        "sections.tex",
         //"short_document.tex",
         //"tikz_network.sty",
         //"unicode.tex",

diff --git a/tests/source/sections.tex b/tests/source/sections.tex
@@ -0,0 +1,7 @@
+\section{Section test}
+
+Sectioning commands should be moved to their own line.\subsection{Result} Even if there are more than one.\subsection{Result 2}
+
+Also \section*{A} unnumbered sectioning commands \subsection*{B} should be split onto their own lines, even if there \subsubsection*{C} are more than one.
+
+All of this \part{D} should also hold \part*{E} for parts \chapter{F} and chapters \chapter*{G}.
diff --git a/tests/target/sections.tex b/tests/target/sections.tex
@@ -0,0 +1,16 @@
+\section{Section test}
+
+Sectioning commands should be moved to their own line.
+\subsection{Result} Even if there are more than one.
+\subsection{Result 2}
+
+Also
+\section*{A} unnumbered sectioning commands
+\subsection*{B} should be split onto their own lines, even if there
+\subsubsection*{C} are more than one.
+
+All of this
+\part{D} should also hold
+\part*{E} for parts
+\chapter{F} and chapters
+\chapter*{G}.