leanprover · david-christiansen · Feb 20, 2024 · Feb 10, 2024 · Feb 10, 2024 · Feb 10, 2024
@@ -290,73 +290,135 @@ where go (acc : String) (s : String) : List String → String
   | a :: as => go (acc ++ s ++ a) s as
   | []      => acc
 
-/-- Iterator for `String`. That is, a `String` and a position in that string. -/
+/-- Iterator over the characters (`Char`) of a `String`.
+
+Typically created by `s.iter`, where `s` is a `String`.
+
+An iterator is *valid* if the position `i` is *valid* for the string `s`, meaning `0 ≤ i ≤ s.endPos`
+and `i` lies on a UTF-8 byte boundary, consistently with [std's `String.Pos.Valid`][valid]. If
+`i = s.endPos`, the iterator is *at the end (of `s`)*.
+
+Most operations on iterators return arbitrary values if the iterator is not valid. The functions in
+the `String.Iterator` API should prevent you from dealing with invalid iterators, with two
+exceptions:
+
+- `Iterator.next iter` is invalid if `iter` is already at the end of the string (`iter.atEnd` is
+  true), and
+- `Iterator.forward iter n`/`Iterator.nextn iter n` is invalid if `n` is strictly greater than the
+  number of remaining characters.
+
+[valid]: https://leanprover-community.github.io/mathlib4_docs/Std/Data/String/Lemmas.html#String.Pos.Valid
+-/
 structure Iterator where
+  /-- The string the iterator is for. -/
   s : String
+  /-- The current position.
+
+  This position is not necessarily valid for the string, for instance if one keeps calling
+  `Iterator.next` when `Iterator.atEnd` is true. If the position is not valid, then the
+  current character is `(default : Char)`, similar to `String.get` on an invalid position. -/
   i : Pos
   deriving DecidableEq
 
+/-- Creates an iterator over `s`, positioned at the start of the string (byte position `0`). -/
 def mkIterator (s : String) : Iterator :=
   ⟨s, 0⟩
 
+-- Short alias of `mkIterator`, typically used as `s.iter` for a string `s`.
+@[inherit_doc mkIterator]
 abbrev iter := mkIterator
 
+/-- The size of a string iterator is the number of bytes remaining: the UTF-8 byte size of its
+string minus the byte index of its position. -/
 instance : SizeOf String.Iterator where
   sizeOf i := i.1.utf8ByteSize - i.2.byteIdx
 
+/-- Unfolds `sizeOf` on a string iterator to the number of bytes remaining; holds by `rfl`. -/
 theorem Iterator.sizeOf_eq (i : String.Iterator) : sizeOf i = i.1.utf8ByteSize - i.2.byteIdx :=
   rfl
 
 namespace Iterator
-def toString : Iterator → String
-  | ⟨s, _⟩ => s
+-- Plain projection of the `s` field: the string the iterator is for (the position is ignored).
+@[inherit_doc Iterator.s]
+def toString := Iterator.s
 
+/-- Number of bytes remaining in the iterator, i.e. between its current position and the end of
+its string. -/
 def remainingBytes : Iterator → Nat
   | ⟨s, i⟩ => s.endPos.byteIdx - i.byteIdx
 
-def pos : Iterator → Pos
-  | ⟨_, i⟩ => i
+-- Plain projection of the `i` field: the iterator's current position (the string is ignored).
+@[inherit_doc Iterator.i]
+def pos := Iterator.i
+
+/-- The character of the iterator's string at the current position.
 
+On an invalid position, returns `(default : Char)`. -/
 def curr : Iterator → Char
   | ⟨s, i⟩ => get s i
 
+/-- Moves the iterator's position forward by one character, unconditionally.
+
+It is only valid to call this function if the iterator is not at the end of the string, *i.e.*
+`Iterator.atEnd` is false; otherwise, the resulting iterator will be invalid. -/
 def next : Iterator → Iterator
   | ⟨s, i⟩ => ⟨s, s.next i⟩
 
+/-- Moves the iterator's position back by one character.
+
+If the position is zero, this function is the identity. -/
 def prev : Iterator → Iterator
   | ⟨s, i⟩ => ⟨s, s.prev i⟩
 
+/-- True if the iterator is past the string's last character, i.e. its byte index is greater than
+or equal to that of the string's end position. -/
 def atEnd : Iterator → Bool
   | ⟨s, i⟩ => i.byteIdx ≥ s.endPos.byteIdx
 
+/-- True if the iterator is not past the string's last character, i.e. its byte index is strictly
+less than that of the string's end position. This is the negation of `atEnd`. -/
 def hasNext : Iterator → Bool
   | ⟨s, i⟩ => i.byteIdx < s.endPos.byteIdx
 
+/-- True if the position is not zero, i.e. the iterator's byte index is strictly positive. -/
 def hasPrev : Iterator → Bool
   | ⟨_, i⟩ => i.byteIdx > 0
 
+/-- Replaces the current character in the string, leaving the position unchanged.
+
+Does nothing if the iterator is at the end of the string. If the iterator contains the only
+reference to its string, this function will mutate the string in-place instead of allocating a new
+one. -/
 def setCurr : Iterator → Char → Iterator
   | ⟨s, i⟩, c => ⟨s.set i c, i⟩
 
+/-- Moves the iterator's position to the end of the string.
+
+Note that `it.toEnd.atEnd` is always true for any iterator `it`. -/
 def toEnd : Iterator → Iterator
   | ⟨s, _⟩ => ⟨s, s.endPos⟩
 
+/-- Extracts the substring between the positions of two iterators over the same string.
+
+Returns the empty string if the iterators are for different strings, or if the position of the first
+iterator is past the position of the second iterator. -/
 def extract : Iterator → Iterator → String
   | ⟨s₁, b⟩, ⟨s₂, e⟩ =>
     if s₁ ≠ s₂ || b > e then ""
     else s₁.extract b e
 
+/-- Moves the iterator's position several characters forward.
+
+Calling this function is only valid if the number of characters to skip is less than or equal to
+the number of characters left in the iterator; otherwise, the resulting iterator is invalid. -/
 def forward : Iterator → Nat → Iterator
   | it, 0   => it
   | it, n+1 => forward it.next n
 
+/-- The remaining characters in an iterator, from its current position to the end of its string,
+as a string. -/
 def remainingToString : Iterator → String
   | ⟨s, i⟩ => s.extract i s.endPos
 
+-- Defined by the same recursion as `forward`: applies `next` the given number of times.
+@[inherit_doc forward]
 def nextn : Iterator → Nat → Iterator
   | it, 0   => it
   | it, i+1 => nextn it.next i
 
+/-- Moves the iterator's position several characters back, by applying `prev` that many times.
+
+If asked to go back more characters than available, stops at the beginning of the string. -/
 def prevn : Iterator → Nat → Iterator
   | it, 0   => it
   | it, i+1 => prevn it.prev i

diff --git a/src/Init/Prelude.lean b/src/Init/Prelude.lean
@@ -2373,6 +2373,12 @@ Codepoint positions (counting the Unicode codepoints rather than bytes)
 are represented by plain `Nat`s instead.
 Indexing a `String` by a byte position is constant-time, while codepoint
 positions need to be translated internally to byte positions in linear-time.
+
+A byte position `p` is *valid* for a string `s` if `0 ≤ p ≤ s.endPos` and `p`
+lies on a UTF-8 byte boundary. This notion is properly introduced in [std's
+`String.Pos.Valid`][valid].
+
+[valid]: https://leanprover-community.github.io/mathlib4_docs/Std/Data/String/Lemmas.html#String.Pos.Valid
 -/
 structure String.Pos where
   /-- Get the underlying byte index of a `String.Pos` -/