Init.Data.String.Extra

String.toNat! definition

(s : String) : Nat

Full source

/--
Interprets a string as the decimal representation of a natural number, returning it. Panics if the
string does not contain a decimal natural number.

A string can be interpreted as a decimal natural number if it is not empty and all the characters in
it are digits.

Use `String.isNat` to check whether `String.toNat!` would return a value. `String.toNat?` is a safer
alternative that returns `none` instead of panicking when the string is not a natural number.

Examples:
 * `"0".toNat! = 0`
 * `"5".toNat! = 5`
 * `"587".toNat! = 587`
-/
def toNat! (s : String) : Nat :=
  if s.isNat then
    s.foldl (fun n c => n*10 + (c.toNat - '0'.toNat)) 0
  else
    panic! "Nat expected"

String to natural number conversion (panicking)

Informal description

The function interprets a string $ s $ as the decimal representation of a natural number and returns the corresponding natural number. If the string is empty or contains any non-digit characters, the function raises a panic (exception). A string is considered a valid decimal natural number if it is non-empty and all its characters are ASCII digits (i.e., between '0' and '9' inclusive). The conversion is done by iterating through each character of the string, multiplying the accumulated value by 10, and adding the numeric value of the current character (obtained by subtracting the ASCII value of '0' from the character's ASCII value). Examples: - $ \text{toNat!}("0") = 0 $ - $ \text{toNat!}("5") = 5 $ - $ \text{toNat!}("587") = 587 $

String.utf8DecodeChar? definition

(a : ByteArray) (i : Nat) : Option Char

Full source

/--
Decodes the UTF-8 character sequence that starts at a given index in a byte array, or `none` if
index `i` is out of bounds or is not the start of a valid UTF-8 character.
-/
def utf8DecodeChar? (a : ByteArray) (i : Nat) : Option Char := do
  let c ← a[i]?
  if c &&& 0x80 == 0 then
    some ⟨c.toUInt32, .inl (Nat.lt_trans c.toBitVec.isLt (by decide))⟩
  else if c &&& 0xe0 == 0xc0 then
    let c1 ← a[i+1]?
    guard (c1 &&& 0xc0 == 0x80)
    let r := ((c &&& 0x1f).toUInt32 <<< 6) ||| (c1 &&& 0x3f).toUInt32
    guard (0x80 ≤ r)
    -- TODO: Prove h from the definition of r once we have the necessary lemmas
    if h : r < 0xd800 then some ⟨r, .inl ((UInt32.lt_ofNat_iff (by decide)).1 h)⟩ else none
  else if c &&& 0xf0 == 0xe0 then
    let c1 ← a[i+1]?
    let c2 ← a[i+2]?
    guard (c1 &&& 0xc0 == 0x80 && c2 &&& 0xc0 == 0x80)
    let r :=
      ((c &&& 0x0f).toUInt32 <<< 12) |||
      ((c1 &&& 0x3f).toUInt32 <<< 6) |||
      (c2 &&& 0x3f).toUInt32
    guard (0x800 ≤ r)
    -- TODO: Prove `r < 0x110000` from the definition of r once we have the necessary lemmas
    if h : r < 0xd800 ∨ 0xdfff < r ∧ r < 0x110000 then
      have :=
        match h with
        | .inl h => Or.inl ((UInt32.lt_ofNat_iff (by decide)).1 h)
        | .inr h => Or.inr ⟨(UInt32.ofNat_lt_iff (by decide)).1 h.left, (UInt32.lt_ofNat_iff (by decide)).1 h.right⟩
      some ⟨r, this⟩
    else
      none
  else if c &&& 0xf8 == 0xf0 then
    let c1 ← a[i+1]?
    let c2 ← a[i+2]?
    let c3 ← a[i+3]?
    guard (c1 &&& 0xc0 == 0x80 && c2 &&& 0xc0 == 0x80 && c3 &&& 0xc0 == 0x80)
    let r :=
      ((c &&& 0x07).toUInt32 <<< 18) |||
      ((c1 &&& 0x3f).toUInt32 <<< 12) |||
      ((c2 &&& 0x3f).toUInt32 <<< 6) |||
      (c3 &&& 0x3f).toUInt32
    if h : 0x10000 ≤ r ∧ r < 0x110000 then
      some ⟨r, .inr ⟨Nat.lt_of_lt_of_le (by decide) ((UInt32.ofNat_le_iff (by decide)).1 h.left), (UInt32.lt_ofNat_iff (by decide)).1 h.right⟩⟩
    else none
  else
    none

UTF-8 character decoding at index

Informal description

Given a byte array `a` and an index `i`, the function attempts to decode the UTF-8 character sequence starting at index `i` in `a`. It returns `some c` if `i` is within bounds and the bytes starting at `i` form a valid UTF-8 encoded character `c`, and `none` otherwise. The decoding follows the UTF-8 encoding rules: 1. For a 1-byte sequence (ASCII), the character is directly the byte value if it is less than 0x80. 2. For a 2-byte sequence, the first byte must start with `0xc0` and the second with `0x80`, and the decoded value must be at least 0x80. 3. For a 3-byte sequence, the first byte must start with `0xe0` and the next two with `0x80`, and the decoded value must be at least 0x800 and not a surrogate. 4. For a 4-byte sequence, the first byte must start with `0xf0` and the next three with `0x80`, and the decoded value must be between 0x10000 and 0x10FFFF.

String.validateUTF8 definition

(a : @& ByteArray) : Bool

Full source

/--
Checks whether an array of bytes is a valid UTF-8 encoding of a string.
-/
@[extern "lean_string_validate_utf8"]
def validateUTF8 (a : @& ByteArray) : Bool :=
  (loop 0).isSome
where
  loop (i : Nat) : Option Unit := do
    if i < a.size then
      let c ← utf8DecodeChar? a i
      loop (i + c.utf8Size)
    else pure ()
  termination_by a.size - i
  decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right c.utf8Size_pos)

UTF-8 validation for byte arrays

Informal description

The function checks whether a given byte array `a` is a valid UTF-8 encoding of a string. It does this by iterating through the byte array, attempting to decode each character in sequence according to UTF-8 rules, and ensuring the entire array is consumed without any invalid sequences.

String.fromUTF8 definition

(a : @& ByteArray) (h : validateUTF8 a) : String

Full source

/--
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
the corresponding string. Invalid UTF-8 characters in the byte array result in `(default : Char)`,
or `'A'`, in the string.
-/
@[extern "lean_string_from_utf8_unchecked"]
def fromUTF8 (a : @& ByteArray) (h : validateUTF8 a) : String :=
  loop 0 ""
where
  loop (i : Nat) (acc : String) : String :=
    if i < a.size then
      let c := (utf8DecodeChar? a i).getD default
      loop (i + c.utf8Size) (acc.push c)
    else acc
  termination_by a.size - i
  decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right c.utf8Size_pos)

UTF-8 decoding of validated byte array to string

Informal description

Given a byte array `a` that has been validated as a correct UTF-8 encoding (via `validateUTF8`), the function constructs the corresponding string by sequentially decoding each UTF-8 character in the array. Invalid sequences are replaced with a default character (typically `'A'`). The decoding process iterates through the byte array, accumulating characters into a string until the entire array is processed.

String.fromUTF8? definition

(a : ByteArray) : Option String

Full source

/--
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
the corresponding string, or returns `none` if the array is not a valid UTF-8 encoding of a string.
-/
@[inline] def fromUTF8? (a : ByteArray) : Option String :=
  if h : validateUTF8 a then fromUTF8 a h else none

UTF-8 decoding of byte array to optional string

Informal description

The function takes a byte array `a` and returns an optional string (`Option String`). If `a` is a valid UTF-8 encoding of a string, it returns `some s` where `s` is the decoded string; otherwise, it returns `none`.

String.fromUTF8! definition

(a : ByteArray) : String

Full source

/--
Decodes an array of bytes that encode a string as [UTF-8](https://en.wikipedia.org/wiki/UTF-8) into
the corresponding string, or panics if the array is not a valid UTF-8 encoding of a string.
-/
@[inline] def fromUTF8! (a : ByteArray) : String :=
  if h : validateUTF8 a then fromUTF8 a h else panic! "invalid UTF-8 string"

UTF-8 decoding of byte array to string (with panic on invalid input)

Informal description

The function decodes a byte array `a` encoded in UTF-8 into a string. If the byte array is not a valid UTF-8 encoding, the function raises a panic with the message "invalid UTF-8 string".

String.utf8EncodeChar definition

(c : Char) : List UInt8

Full source

/--
Returns the sequence of bytes in a character's UTF-8 encoding.
-/
def utf8EncodeChar (c : Char) : List UInt8 :=
  let v := c.val
  if v ≤ 0x7f then
    [v.toUInt8]
  else if v ≤ 0x7ff then
    [(v >>>  6).toUInt8 &&& 0x1f ||| 0xc0,
              v.toUInt8 &&& 0x3f ||| 0x80]
  else if v ≤ 0xffff then
    [(v >>> 12).toUInt8 &&& 0x0f ||| 0xe0,
     (v >>>  6).toUInt8 &&& 0x3f ||| 0x80,
              v.toUInt8 &&& 0x3f ||| 0x80]
  else
    [(v >>> 18).toUInt8 &&& 0x07 ||| 0xf0,
     (v >>> 12).toUInt8 &&& 0x3f ||| 0x80,
     (v >>>  6).toUInt8 &&& 0x3f ||| 0x80,
              v.toUInt8 &&& 0x3f ||| 0x80]

UTF-8 encoding of a Unicode character

Informal description

The function takes a Unicode character `c` and returns a list of unsigned 8-bit integers representing its UTF-8 encoding. The encoding is determined as follows: - If the character's code point `v` is ≤ 0x7F, it is encoded as a single byte `[v]`. - If `v` ≤ 0x7FF, it is encoded as two bytes `[(v >>> 6) & 0x1F | 0xC0, v & 0x3F | 0x80]`. - If `v` ≤ 0xFFFF, it is encoded as three bytes `[(v >>> 12) & 0x0F | 0xE0, (v >>> 6) & 0x3F | 0x80, v & 0x3F | 0x80]`. - Otherwise, it is encoded as four bytes `[(v >>> 18) & 0x07 | 0xF0, (v >>> 12) & 0x3F | 0x80, (v >>> 6) & 0x3F | 0x80, v & 0x3F | 0x80]`.

String.length_utf8EncodeChar theorem

(c : Char) : (utf8EncodeChar c).length = c.utf8Size

Full source

@[simp] theorem length_utf8EncodeChar (c : Char) : (utf8EncodeChar c).length = c.utf8Size := by
  simp [Char.utf8Size, utf8EncodeChar]
  cases Decidable.em (c.val ≤ 0x7f) <;> simp [*]
  cases Decidable.em (c.val ≤ 0x7ff) <;> simp [*]
  cases Decidable.em (c.val ≤ 0xffff) <;> simp [*]

UTF-8 Encoding Length Equals Character's UTF-8 Size

Informal description

For any Unicode character $c$, the length of the list obtained by UTF-8 encoding $c$ is equal to the UTF-8 size of $c$.

String.toUTF8 definition

(a : @& String) : ByteArray

Full source

/--
Encodes a string in UTF-8 as an array of bytes.
-/
@[extern "lean_string_to_utf8"]
def toUTF8 (a : @& String) : ByteArray :=
  ⟨⟨a.data.flatMap utf8EncodeChar⟩⟩

UTF-8 encoding of a string

Informal description

The function takes a string `a` and returns its UTF-8 encoding as a byte array. The encoding is obtained by concatenating the UTF-8 encodings of each character in the string.

String.size_toUTF8 theorem

(s : String) : s.toUTF8.size = s.utf8ByteSize

Full source

@[simp] theorem size_toUTF8 (s : String) : s.toUTF8.size = s.utf8ByteSize := by
  simp [toUTF8, ByteArray.size, Array.size, utf8ByteSize, List.flatMap]
  induction s.data <;> simp [List.map, List.flatten, utf8ByteSize.go, Nat.add_comm, *]

UTF-8 Encoding Preserves Byte Size: $\text{size}(\text{toUTF8}(s)) = \text{utf8ByteSize}(s)$

Informal description

For any string $s$, the size (in bytes) of its UTF-8 encoded byte array equals the UTF-8 byte size of $s$, i.e., $\text{size}(\text{toUTF8}(s)) = \text{utf8ByteSize}(s)$.

String.getUtf8Byte definition

(s : @& String) (n : Nat) (h : n < s.utf8ByteSize) : UInt8

Full source

/--
Accesses the indicated byte in the UTF-8 encoding of a string.

At runtime, this function is implemented by efficient, constant-time code.
-/
@[extern "lean_string_get_byte_fast"]
def getUtf8Byte (s : @& String) (n : Nat) (h : n < s.utf8ByteSize) : UInt8 :=
  (toUTF8 s)[n]'(size_toUTF8 _ ▸ h)

UTF-8 byte accessor for strings

Informal description

Given a string $s$ and a natural number $n$ such that $n$ is less than the UTF-8 byte size of $s$, the function returns the $n$-th byte in the UTF-8 encoding of $s$.

String.Iterator.sizeOf_next_lt_of_hasNext theorem

(i : String.Iterator) (h : i.hasNext) : sizeOf i.next < sizeOf i

Full source

theorem Iterator.sizeOf_next_lt_of_hasNext (i : String.Iterator) (h : i.hasNext) : sizeOf i.next < sizeOf i := by
  cases i; rename_i s pos; simp [Iterator.next, Iterator.sizeOf_eq]; simp [Iterator.hasNext] at h
  exact Nat.sub_lt_sub_left h (String.lt_next s pos)

Decreasing Size of String Iterator When Advancing

Informal description

For any string iterator $i$ that has a next character (i.e., $i.\text{hasNext}$ holds), the size of the iterator obtained by advancing to the next character ($i.\text{next}$) is strictly less than the size of the original iterator $i$.

String.Iterator.sizeOf_next_lt_of_atEnd theorem

(i : String.Iterator) (h : ¬i.atEnd = true) : sizeOf i.next < sizeOf i

Full source

theorem Iterator.sizeOf_next_lt_of_atEnd (i : String.Iterator) (h : ¬ i.atEnd = true) : sizeOf i.next < sizeOf i :=
  have h : i.hasNext := decide_eq_true <| Nat.gt_of_not_le <| mt decide_eq_true h
  sizeOf_next_lt_of_hasNext i h

Decreasing Size Property for String Iterators When Not at End

Informal description

For any string iterator $i$ that is not at the end of the string (i.e., $i.\text{atEnd}$ is not `true`), the size of the iterator obtained by advancing to the next character ($i.\text{next}$) is strictly less than the size of the original iterator $i$.

String.Iterator.find definition

(it : Iterator) (p : Char → Bool) : Iterator

Full source

/--
Moves the iterator forward until the Boolean predicate `p` returns `true` for the iterator's current
character or until the end of the string is reached. Does nothing if the current character already
satisfies `p`.
-/
@[specialize] def find (it : Iterator) (p : Char → Bool) : Iterator :=
  if it.atEnd then it
  else if p it.curr then it
  else find it.next p

String iterator find operation

Informal description

Given a string iterator `it` and a predicate `p` on characters, the function `find` advances the iterator until either the current character satisfies `p` or the end of the string is reached. If the iterator is already at the end or the current character satisfies `p`, the iterator is returned unchanged.

String.Iterator.foldUntil definition

(it : Iterator) (init : α) (f : α → Char → Option α) : α × Iterator

Full source

/--
Iterates over a string, updating a state at each character using the provided function `f`, until
`f` returns `none`. Begins with the state `init`. Returns the state and character for which `f`
returns `none`.
-/
@[specialize] def foldUntil (it : Iterator) (init : α) (f : α → Char → Option α) : α × Iterator :=
  if it.atEnd then
    (init, it)
  else if let some a := f init it.curr then
    foldUntil it.next a f
  else
    (init, it)

String iterator fold with early termination

Informal description

Given a string iterator `it`, an initial state `init`, and a function `f : α → Char → Option α`, the function `foldUntil` iterates over the string starting from `it`, applying `f` to each character and the current state. The iteration continues until either the iterator reaches the end of the string or `f` returns `none`. The function returns the final state and the iterator position at which `f` returned `none` or the end of the string was reached.

String.removeLeadingSpaces definition

(s : String) : String

Full source

/--
Consistently de-indents the lines in a string, removing the same amount of leading whitespace from
each line such that the least-indented line has no leading whitespace.

The number of leading whitespace characters to remove from each line is determined by counting the
number of leading space (`' '`) and tab (`'\t'`) characters on lines after the first line that also
contain non-whitespace characters. No distinction is made between tab and space characters; both
count equally.

The least number of leading whitespace characters found is then removed from the beginning of each
line. The first line's leading whitespace is not counted when determining how far to de-indent the
string, but leading whitespace is removed from it.

Examples:
* `"Here:\n  fun x =>\n    x + 1".removeLeadingSpaces = "Here:\nfun x =>\n  x + 1"`
* `"Here:\n\t\tfun x =>\n\t  \tx + 1".removeLeadingSpaces = "Here:\nfun x =>\n \tx + 1"`
* `"Here:\n\t\tfun x =>\n \n\t  \tx + 1".removeLeadingSpaces = "Here:\nfun x =>\n\n \tx + 1"`
-/
def removeLeadingSpaces (s : String) : String :=
  let n := findLeadingSpacesSize s
  if n == 0 then s else removeNumLeadingSpaces n s

String de-indentation

Informal description

The function removes a consistent amount of leading whitespace from each line in a string such that the least-indented line has no leading whitespace. The amount of whitespace removed is determined by counting the leading spaces and tabs on lines that contain non-whitespace characters (excluding the first line). The same number of leading whitespace characters is then removed from all lines, including the first line. Examples: - `"Here:\n fun x =>\n x + 1".removeLeadingSpaces` becomes `"Here:\nfun x =>\n x + 1"` - `"Here:\n\t\tfun x =>\n\t \tx + 1".removeLeadingSpaces` becomes `"Here:\nfun x =>\n \tx + 1"` - `"Here:\n\t\tfun x =>\n \n\t \tx + 1".removeLeadingSpaces` becomes `"Here:\nfun x =>\n\n \tx + 1"`

String.crlfToLf definition

(text : String) : String

Full source

/--
Replaces each `\r\n` with `\n` to normalize line endings, but does not validate that there are no
isolated `\r` characters.

This is an optimized version of `String.replace text "\r\n" "\n"`.
-/
def crlfToLf (text : String) : String :=
  go "" 0 0
where
  go (acc : String) (accStop pos : String.Pos) : String :=
    if h : text.atEnd pos then
      -- note: if accStop = 0 then acc is empty
      if accStop = 0 then text else acc ++ text.extract accStop pos
    else
      let c := text.get' pos h
      let pos' := text.next' pos h
      if h' : ¬ text.atEnd pos' ∧ c == '\r' ∧ text.get pos' == '\n' then
        let acc := acc ++ text.extract accStop pos
        go acc pos' (text.next' pos' h'.1)
      else
        go acc accStop pos'
  termination_by text.utf8ByteSize - pos.byteIdx
  decreasing_by
    decreasing_with
      show text.utf8ByteSize - (text.next (text.next pos)).byteIdx < text.utf8ByteSize - pos.byteIdx
      have k := Nat.gt_of_not_le <| mt decide_eq_true h
      exact Nat.sub_lt_sub_left k (Nat.lt_trans (String.lt_next text pos) (String.lt_next _ _))
    decreasing_with
      show text.utf8ByteSize - (text.next pos).byteIdx < text.utf8ByteSize - pos.byteIdx
      have k := Nat.gt_of_not_le <| mt decide_eq_true h
      exact Nat.sub_lt_sub_left k (String.lt_next _ _)

Normalize line endings from Windows to Unix style

Informal description

The function replaces all occurrences of the Windows-style line ending `\r\n` with the Unix-style line ending `\n` in the input string `text`, effectively normalizing line endings. It does not check for isolated `\r` characters that are not followed by `\n`.