Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: [sc-24365] ens-normalize add "ignored" characters to disallowed sequence #61

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 59 additions & 8 deletions ens_normalize/normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,10 @@ def post_check_empty(name: str, input: str) -> Optional[CurableSequence]:
# fully ignorable name
return CurableSequence(
CurableSequenceType.EMPTY_LABEL,
index=0,
# We set the index to -1 to let offset_err_start()
# know that this is the special empty name case.
# Otherwise, it would offset the index past the ignored characters.
index=-1,
sequence=input,
suggested='',
)
Expand All @@ -581,7 +584,7 @@ def post_check_empty(name: str, input: str) -> Optional[CurableSequence]:
return CurableSequence(
CurableSequenceType.EMPTY_LABEL,
index=i,
sequence='..',
sequence='..', # !!
suggested='.',
)

Expand All @@ -598,7 +601,7 @@ def post_check_underscore(label: str) -> Optional[CurableSequence]:
return CurableSequence(
CurableSequenceType.UNDERSCORE,
index=i,
sequence='_' * cnt,
sequence='_' * cnt, # !!
suggested='',
)

Expand All @@ -608,7 +611,7 @@ def post_check_hyphen(label: str) -> Optional[CurableSequence]:
return CurableSequence(
CurableSequenceType.HYPHEN,
index=2,
sequence='--',
sequence='--', # !!
suggested='',
)

Expand Down Expand Up @@ -648,7 +651,7 @@ def make_fenced_error(cps: List[int], start: int, end: int) -> CurableSequence:
return CurableSequence(
type_,
index=start,
sequence=''.join(map(chr, cps[start:end])),
sequence=''.join(map(chr, cps[start:end])), # !!
suggested=suggested,
)

Expand Down Expand Up @@ -1057,7 +1060,7 @@ def ens_process(
label_is_greek = []
error = post_check(emojis_as_fe0f, label_is_greek, input)
if isinstance(error, CurableSequence): # or NormalizableSequence because of inheritance
offset_err_start(error, tokens)
offset_err_start(error, tokens, input)

# else:
# only the result of post_check() is not input aligned
Expand Down Expand Up @@ -1092,17 +1095,64 @@ def ens_process(
)


def offset_err_start(err: Optional[CurableSequence], tokens: List[Token]):
def restore_ignored_in_sequence(seq: str, input: str) -> str:
    """
    Rebuild ``seq`` from ``input`` so that skipped characters reappear in it.

    ``input`` is scanned left to right for the characters of ``seq``, in order.
    An input character counts as a match when it equals the next wanted
    character, or when its entry in ``NORMALIZATION.mapped`` contains that
    character's code point.

    Args:
        seq: The sequence whose characters are searched for in ``input``.
        input: The raw text to scan (the caller is expected to pass the input
            starting at the error position).

    Returns:
        The stretch of ``input`` running from the first matched character to
        the last one, including every unmatched character in between. ``seq``
        is returned unchanged when it is empty or when ``input`` runs out
        before all of its characters were matched.
    """
    if not seq:
        return seq

    wanted_cps = [ord(ch) for ch in seq]
    wanted_total = len(wanted_cps)
    collected = []
    hits = 0

    for raw_char in input:
        raw_cp = ord(raw_char)
        # A mapped input character is accepted if its mapping contains the
        # wanted code point; unmapped characters only match themselves.
        candidates = NORMALIZATION.mapped.get(raw_cp, [raw_cp])
        wanted = wanted_cps[hits]

        if raw_cp == wanted or wanted in candidates:
            collected.append(raw_char)
            hits += 1
            if hits == wanted_total:
                # Every character of seq has been located; stop scanning.
                break
        elif hits:
            # Between two matches: keep the character verbatim. Nothing here
            # verifies that it really is an ignored character.
            collected.append(raw_char)
        # Characters ahead of the first match are dropped.

    if hits < wanted_total:
        # Input was exhausted before seq was fully matched: fall back.
        return seq

    return ''.join(collected)


def offset_err_start(err: Optional[CurableSequence], tokens: List[Token], input: str):
"""
Output of post_check() is not input aligned.
This function offsets the error index (in-place) to match the input characters.
"""
if err.index < 0:
# empty name case
err.index = 0
return
# index in string that was scanned
i = 0
# offset between input and scanned
offset = 0
for tok in tokens:
if i >= err.index:
if i > err.index:
# everything before the error is aligned
break
if tok.type in (TY_IGNORED, TY_DISALLOWED):
Expand All @@ -1127,6 +1177,7 @@ def offset_err_start(err: Optional[CurableSequence], tokens: List[Token]):
# input: cps, scanned: cps
i += len(tok.cps)
err.index += offset
err.sequence = restore_ignored_in_sequence(err.sequence, input[err.index :])


def ens_normalize(text: str) -> str:
Expand Down
41 changes: 41 additions & 0 deletions tests/test_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,3 +532,44 @@ def test_simple_name_optimization():
assert len(r.cures) == 0
assert r.error is None
assert r.normalizations is None


@pytest.mark.parametrize(
'input_str, expected_code, expected_index, expected_sequence, expected_suggested',
[
('nick.\ufe0f\ufe0f.eth', 'EMPTY_LABEL', 4, '.\ufe0f\ufe0f.', '.'),
('01\ufe0f--345', 'HYPHEN', 3, '--', ''),
('01-\ufe0f-345', 'HYPHEN', 2, '-\ufe0f-', ''),
("\ufe0f'b", 'FENCED_LEADING', 1, "'", ''),
],
)
def test_suggestions_with_ignored(input_str, expected_code, expected_index, expected_sequence, expected_suggested):
    """Check the error reported by ens_process() for inputs containing ignored characters."""
    err = ens_process(input_str).error
    # Checked one attribute at a time, in this order, so the first mismatch fails the test.
    expectations = (
        ('code', expected_code),
        ('index', expected_index),
        ('sequence', expected_sequence),
        ('suggested', expected_suggested),
    )
    for attr_name, wanted in expectations:
        assert getattr(err, attr_name) == wanted


@pytest.mark.parametrize(
'input_str, expected_type, expected_index, expected_sequence, expected_suggested',
[
# Test mapped characters with ignored characters
('aA\ufe0fA', NormalizableSequenceType.MAPPED, 1, 'A', 'a'), # Single capital A gets mapped
('aAB', NormalizableSequenceType.MAPPED, 1, 'A', 'a'), # First capital gets mapped
# Test FE0F normalization
('a🚴‍♂️', NormalizableSequenceType.FE0F, 1, '🚴‍♂️', '🚴‍♂'), # FE0F in emoji
# Test ignored characters
('a\u00ad', NormalizableSequenceType.IGNORED, 1, '\u00ad', ''), # Soft hyphen is ignored
# Test FE0F as ignored
('a\ufe0f', NormalizableSequenceType.IGNORED, 1, '\ufe0f', ''), # FE0F by itself is ignored
],
)
def test_normalizations_with_ignored(input_str, expected_type, expected_index, expected_sequence, expected_suggested):
    """Check the first normalization returned by ens_normalizations() for each input."""
    found = ens_normalizations(input_str)
    assert len(found) > 0
    # Only the first reported normalization is inspected.
    first = found[0]
    expectations = (
        ('type', expected_type),
        ('index', expected_index),
        ('sequence', expected_sequence),
        ('suggested', expected_suggested),
    )
    for attr_name, wanted in expectations:
        assert getattr(first, attr_name) == wanted
Loading