Skip to content

Commit

Permalink
Introduce PythonWhitespace to confine trim operations to Python whi…
Browse files Browse the repository at this point in the history
…tespace (#4994)

## Summary

We use `.trim()` and friends in a bunch of places, to strip whitespace
from source code. However, not all Unicode whitespace characters are
considered "whitespace" in Python, which only supports the standard
space, tab, and form-feed characters.

This PR audits our usages of `.trim()`, `.trim_start()`, `.trim_end()`,
and `char::is_whitespace`, and replaces them as appropriate with a new
`.trim_whitespace()` analogues, powered by a `PythonWhitespace` trait.

In general, the only place that should continue to use `.trim()` is
content within docstrings, which don't need to adhere to Python's
semantic definitions of whitespace.

Closes #4991.
  • Loading branch information
charliermarsh authored Jun 10, 2023
1 parent c1ac500 commit f401050
Show file tree
Hide file tree
Showing 14 changed files with 64 additions and 32 deletions.
8 changes: 4 additions & 4 deletions crates/ruff/src/autofix/edits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use rustpython_parser::{lexer, Mode, Tok};
use ruff_diagnostics::Edit;
use ruff_python_ast::helpers;
use ruff_python_ast::source_code::{Indexer, Locator, Stylist};
use ruff_python_whitespace::NewlineWithTrailingNewline;
use ruff_python_whitespace::{is_python_whitespace, NewlineWithTrailingNewline, PythonWhitespace};

use crate::autofix::codemods;

Expand Down Expand Up @@ -242,7 +242,7 @@ fn trailing_semicolon(stmt: &Stmt, locator: &Locator) -> Option<TextSize> {
let contents = locator.after(stmt.end());

for line in NewlineWithTrailingNewline::from(contents) {
let trimmed = line.trim_start();
let trimmed = line.trim_whitespace_start();

if trimmed.starts_with(';') {
let colon_offset = line.text_len() - trimmed.text_len();
Expand All @@ -262,7 +262,7 @@ fn next_stmt_break(semicolon: TextSize, locator: &Locator) -> TextSize {

let contents = &locator.contents()[usize::from(start_location)..];
for line in NewlineWithTrailingNewline::from(contents) {
let trimmed = line.trim();
let trimmed = line.trim_whitespace();
// Skip past any continuations.
if trimmed.starts_with('\\') {
continue;
Expand All @@ -276,7 +276,7 @@ fn next_stmt_break(semicolon: TextSize, locator: &Locator) -> TextSize {
} else {
// Otherwise, find the start of the next statement. (Or, anything that isn't
// whitespace.)
let relative_offset = line.find(|c: char| !c.is_whitespace()).unwrap();
let relative_offset = line.find(|c: char| !is_python_whitespace(c)).unwrap();
line.start() + TextSize::try_from(relative_offset).unwrap()
};
}
Expand Down
4 changes: 2 additions & 2 deletions crates/ruff/src/importer/insertion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use rustpython_parser::{lexer, Mode, Tok};
use ruff_diagnostics::Edit;
use ruff_python_ast::helpers::is_docstring_stmt;
use ruff_python_ast::source_code::{Locator, Stylist};
use ruff_python_whitespace::UniversalNewlineIterator;
use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator};
use ruff_textwrap::indent;

#[derive(Debug, Clone, PartialEq, Eq)]
Expand Down Expand Up @@ -69,7 +69,7 @@ impl<'a> Insertion<'a> {

// Skip over commented lines.
for line in UniversalNewlineIterator::with_offset(locator.after(location), location) {
if line.trim_start().starts_with('#') {
if line.trim_whitespace_start().starts_with('#') {
location = line.full_end();
} else {
break;
Expand Down
4 changes: 2 additions & 2 deletions crates/ruff/src/noqa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use ruff_text_size::{TextLen, TextRange, TextSize};

use ruff_diagnostics::Diagnostic;
use ruff_python_ast::source_code::Locator;
use ruff_python_whitespace::LineEnding;
use ruff_python_whitespace::{LineEnding, PythonWhitespace};

use crate::codes::NoqaCode;
use crate::registry::{AsRule, Rule, RuleSet};
Expand Down Expand Up @@ -87,7 +87,7 @@ enum ParsedExemption<'a> {

/// Return a [`ParsedExemption`] for a given comment line.
fn parse_file_exemption(line: &str) -> ParsedExemption {
let line = line.trim_start();
let line = line.trim_whitespace_start();

if line.starts_with("# flake8: noqa")
|| line.starts_with("# flake8: NOQA")
Expand Down
3 changes: 2 additions & 1 deletion crates/ruff/src/rules/flake8_pytest_style/rules/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use rustpython_parser::ast::{self, Constant, Decorator, Expr, Keyword};
use ruff_python_ast::call_path::{collect_call_path, CallPath};
use ruff_python_ast::helpers::map_callable;
use ruff_python_semantic::model::SemanticModel;
use ruff_python_whitespace::PythonWhitespace;

pub(super) fn get_mark_decorators(
decorators: &[Decorator],
Expand Down Expand Up @@ -81,7 +82,7 @@ pub(super) fn split_names(names: &str) -> Vec<&str> {
names
.split(',')
.filter_map(|s| {
let trimmed = s.trim();
let trimmed = s.trim_whitespace();
if trimmed.is_empty() {
None
} else {
Expand Down
3 changes: 2 additions & 1 deletion crates/ruff/src/rules/flake8_simplify/rules/fix_if.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::autofix::codemods::CodegenStylist;
use ruff_diagnostics::Edit;
use ruff_python_ast::source_code::{Locator, Stylist};
use ruff_python_ast::whitespace;
use ruff_python_whitespace::PythonWhitespace;

use crate::cst::matchers::{match_function_def, match_if, match_indented_block, match_statement};

Expand Down Expand Up @@ -44,7 +45,7 @@ pub(crate) fn fix_nested_if_statements(
let contents = locator.lines(stmt.range());

// Handle `elif` blocks differently; detect them upfront.
let is_elif = contents.trim_start().starts_with("elif");
let is_elif = contents.trim_whitespace_start().starts_with("elif");

// If this is an `elif`, we have to remove the `elif` keyword for now. (We'll
// restore the `el` later on.)
Expand Down
4 changes: 2 additions & 2 deletions crates/ruff/src/rules/isort/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use rustpython_parser::ast::{Ranged, Stmt};
use rustpython_parser::{lexer, Mode, Tok};

use ruff_python_ast::source_code::Locator;
use ruff_python_whitespace::UniversalNewlines;
use ruff_python_whitespace::{PythonWhitespace, UniversalNewlines};

use crate::rules::isort::types::TrailingComma;

Expand Down Expand Up @@ -63,7 +63,7 @@ pub(super) fn has_comment_break(stmt: &Stmt, locator: &Locator) -> bool {
// def f(): pass
let mut seen_blank = false;
for line in locator.up_to(stmt.start()).universal_newlines().rev() {
let line = line.trim();
let line = line.trim_whitespace();
if seen_blank {
if line.starts_with('#') {
return true;
Expand Down
6 changes: 4 additions & 2 deletions crates/ruff/src/rules/isort/rules/organize_imports.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use ruff_python_ast::helpers::{
followed_by_multi_statement_line, preceded_by_multi_statement_line, trailing_lines_end,
};
use ruff_python_ast::source_code::{Indexer, Locator, Stylist};
use ruff_python_whitespace::{leading_indentation, UniversalNewlines};
use ruff_python_whitespace::{leading_indentation, PythonWhitespace, UniversalNewlines};
use ruff_textwrap::indent;

use crate::line_width::LineWidth;
Expand Down Expand Up @@ -72,7 +72,9 @@ fn matches_ignoring_indentation(val1: &str, val2: &str) -> bool {
val1.universal_newlines()
.zip_longest(val2.universal_newlines())
.all(|pair| match pair {
EitherOrBoth::Both(line1, line2) => line1.trim_start() == line2.trim_start(),
EitherOrBoth::Both(line1, line2) => {
line1.trim_whitespace_start() == line2.trim_whitespace_start()
}
_ => false,
})
}
Expand Down
5 changes: 3 additions & 2 deletions crates/ruff/src/rules/pycodestyle/rules/logical_lines/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pub(crate) use missing_whitespace_around_operator::{
MissingWhitespaceAroundBitwiseOrShiftOperator, MissingWhitespaceAroundModuloOperator,
MissingWhitespaceAroundOperator,
};
use ruff_python_whitespace::is_python_whitespace;
pub(crate) use space_around_operator::{
space_around_operator, MultipleSpacesAfterOperator, MultipleSpacesBeforeOperator,
TabAfterOperator, TabBeforeOperator,
Expand Down Expand Up @@ -378,7 +379,7 @@ impl Whitespace {
len += c.text_len();
} else if matches!(c, '\n' | '\r') {
break;
} else if c.is_whitespace() {
} else if is_python_whitespace(c) {
count += 1;
len += c.text_len();
} else {
Expand Down Expand Up @@ -409,7 +410,7 @@ impl Whitespace {
} else if matches!(c, '\n' | '\r') {
// Indent
return (Self::None, TextSize::default());
} else if c.is_whitespace() {
} else if is_python_whitespace(c) {
count += 1;
len += c.text_len();
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use ruff_diagnostics::Violation;
use ruff_macros::{derive_message_formats, violation};
use ruff_python_ast::source_code::Locator;
use ruff_python_ast::token_kind::TokenKind;
use ruff_python_whitespace::PythonWhitespace;

use crate::checkers::logical_lines::LogicalLinesContext;
use crate::rules::pycodestyle::rules::logical_lines::LogicalLine;
Expand Down Expand Up @@ -156,7 +157,7 @@ pub(crate) fn whitespace_before_comment(
));
let token_text = locator.slice(range);

let is_inline_comment = !line_text.trim().is_empty();
let is_inline_comment = !line_text.trim_whitespace().is_empty();
if is_inline_comment {
if range.start() - prev_end < " ".text_len() {
context.push(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use rustpython_parser::ast::Ranged;
use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
use ruff_macros::{derive_message_formats, violation};
use ruff_python_semantic::definition::{Definition, Member, MemberKind};
use ruff_python_whitespace::{UniversalNewlineIterator, UniversalNewlines};
use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator, UniversalNewlines};

use crate::checkers::ast::Checker;
use crate::docstrings::Docstring;
Expand Down Expand Up @@ -133,10 +133,9 @@ pub(crate) fn blank_before_after_class(checker: &mut Checker, docstring: &Docstr
.locator
.slice(TextRange::new(docstring.end(), stmt.end()));

let all_blank_after = after
.universal_newlines()
.skip(1)
.all(|line| line.trim().is_empty() || line.trim_start().starts_with('#'));
let all_blank_after = after.universal_newlines().skip(1).all(|line| {
line.trim_whitespace().is_empty() || line.trim_whitespace_start().starts_with('#')
});
if all_blank_after {
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use rustpython_parser::ast::Ranged;
use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
use ruff_macros::{derive_message_formats, violation};
use ruff_python_semantic::definition::{Definition, Member, MemberKind};
use ruff_python_whitespace::{UniversalNewlineIterator, UniversalNewlines};
use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator, UniversalNewlines};

use crate::checkers::ast::Checker;
use crate::docstrings::Docstring;
Expand Down Expand Up @@ -102,10 +102,9 @@ pub(crate) fn blank_before_after_function(checker: &mut Checker, docstring: &Doc
.slice(TextRange::new(docstring.end(), stmt.end()));

// If the docstring is only followed by blank and commented lines, abort.
let all_blank_after = after
.universal_newlines()
.skip(1)
.all(|line| line.trim().is_empty() || line.trim_start().starts_with('#'));
let all_blank_after = after.universal_newlines().skip(1).all(|line| {
line.trim_whitespace().is_empty() || line.trim_whitespace_start().starts_with('#')
});
if all_blank_after {
return;
}
Expand All @@ -129,7 +128,7 @@ pub(crate) fn blank_before_after_function(checker: &mut Checker, docstring: &Doc
// Avoid violations for blank lines followed by inner functions or classes.
if blank_lines_after == 1
&& lines
.find(|line| !line.trim_start().starts_with('#'))
.find(|line| !line.trim_whitespace_start().starts_with('#'))
.map_or(false, |line| INNER_FUNCTION_OR_CLASS_REGEX.is_match(&line))
{
return;
Expand Down
4 changes: 2 additions & 2 deletions crates/ruff_python_ast/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::path::Path;
use itertools::Itertools;
use log::error;
use num_traits::Zero;
use ruff_python_whitespace::UniversalNewlineIterator;
use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator};
use ruff_text_size::{TextRange, TextSize};
use rustc_hash::{FxHashMap, FxHashSet};
use rustpython_parser::ast::{
Expand Down Expand Up @@ -1031,7 +1031,7 @@ pub fn trailing_lines_end(stmt: &Stmt, locator: &Locator) -> TextSize {
let rest = &locator.contents()[usize::from(line_end)..];

UniversalNewlineIterator::with_offset(rest, line_end)
.take_while(|line| line.trim().is_empty())
.take_while(|line| line.trim_whitespace().is_empty())
.last()
.map_or(line_end, |l| l.full_end())
}
Expand Down
4 changes: 2 additions & 2 deletions crates/ruff_python_formatter/src/comments/placement.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use crate::trivia::{SimpleTokenizer, TokenKind};
use ruff_python_ast::node::AnyNodeRef;
use ruff_python_ast::source_code::Locator;
use ruff_python_ast::whitespace;
use ruff_python_whitespace::UniversalNewlines;
use ruff_python_whitespace::{PythonWhitespace, UniversalNewlines};
use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::ast::Ranged;
use std::cmp::Ordering;
Expand Down Expand Up @@ -986,7 +986,7 @@ fn max_empty_lines(contents: &str) -> usize {
let mut max_empty_lines = 0;

for line in contents.universal_newlines().skip(1) {
if line.trim().is_empty() {
if line.trim_whitespace().is_empty() {
empty_lines += 1;
} else {
max_empty_lines = max_empty_lines.max(empty_lines);
Expand Down
28 changes: 28 additions & 0 deletions crates/ruff_python_whitespace/src/whitespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,31 @@ pub fn leading_indentation(line: &str) -> &str {
line.find(|char: char| !is_python_whitespace(char))
.map_or(line, |index| &line[..index])
}

pub trait PythonWhitespace {
/// Like `str::trim()`, but only removes whitespace characters that Python considers
/// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
fn trim_whitespace(&self) -> &Self;

/// Like `str::trim_start()`, but only removes whitespace characters that Python considers
/// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
fn trim_whitespace_start(&self) -> &Self;

/// Like `str::trim_end()`, but only removes whitespace characters that Python considers
/// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
fn trim_whitespace_end(&self) -> &Self;
}

impl PythonWhitespace for str {
fn trim_whitespace(&self) -> &Self {
self.trim_matches(is_python_whitespace)
}

fn trim_whitespace_start(&self) -> &Self {
self.trim_start_matches(is_python_whitespace)
}

fn trim_whitespace_end(&self) -> &Self {
self.trim_end_matches(is_python_whitespace)
}
}

0 comments on commit f401050

Please sign in to comment.