//! Types supporting the UTF-8 parser.
/// Action to take when receiving a byte.
///
/// Each variant carries an explicit discriminant so the numeric value of an
/// action is stable regardless of declaration order.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Action {
    /// Unexpected byte; sequence is invalid.
    InvalidSequence = 0,
    /// Received valid 7-bit ASCII byte which can be directly emitted.
    EmitByte = 1,
    /// Set the bottom continuation byte.
    SetByte1 = 2,
    /// Set the 2nd-from-last continuation byte.
    SetByte2 = 3,
    /// Set the 2nd-from-last byte which is part of a two byte sequence.
    SetByte2Top = 4,
    /// Set the 3rd-from-last continuation byte.
    SetByte3 = 5,
    /// Set the 3rd-from-last byte which is part of a three byte sequence.
    SetByte3Top = 6,
    /// Set the top byte of a four byte sequence.
    SetByte4 = 7,
}
/// States the parser can be in.
///
/// Besides the generic "N tail bytes remaining" states, there is a dedicated
/// state for each lead byte of a 3 or 4 byte sequence (E0, ED, F0, F4) whose
/// following byte is subject to a narrower range than an ordinary tail byte.
// The lead-byte states keep the hex value of the lead byte in their name
// (`e0`, `ed`, `f0`, `f4`), which is why camel-case linting is disabled here.
#[allow(non_camel_case_types)]
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub enum State {
    /// Ground state; expect anything.
    #[default]
    Ground = 0,
    /// 3 tail bytes.
    Tail3 = 1,
    /// 2 tail bytes.
    Tail2 = 2,
    /// 1 tail byte.
    Tail1 = 3,
    /// UTF8-3 starting with E0.
    U3_2_e0 = 4,
    /// UTF8-3 starting with ED.
    U3_2_ed = 5,
    /// UTF8-4 starting with F0.
    Utf8_4_3_f0 = 6,
    /// UTF8-4 starting with F4.
    Utf8_4_3_f4 = 7,
}
4950impl State {
51/// Advance the parser state.
52 ///
53 /// This takes the current state and input byte into consideration, to determine the next state
54 /// and any action that should be taken.
55#[inline]
56pub fn advance(self, byte: u8) -> (State, Action) {
57match self {
58 State::Ground => match byte {
590x00..=0x7f => (State::Ground, Action::EmitByte),
600xc2..=0xdf => (State::Tail1, Action::SetByte2Top),
610xe0 => (State::U3_2_e0, Action::SetByte3Top),
620xe1..=0xec => (State::Tail2, Action::SetByte3Top),
630xed => (State::U3_2_ed, Action::SetByte3Top),
640xee..=0xef => (State::Tail2, Action::SetByte3Top),
650xf0 => (State::Utf8_4_3_f0, Action::SetByte4),
660xf1..=0xf3 => (State::Tail3, Action::SetByte4),
670xf4 => (State::Utf8_4_3_f4, Action::SetByte4),
68_ => (State::Ground, Action::InvalidSequence),
69 },
70 State::U3_2_e0 => match byte {
710xa0..=0xbf => (State::Tail1, Action::SetByte2),
72_ => (State::Ground, Action::InvalidSequence),
73 },
74 State::U3_2_ed => match byte {
750x80..=0x9f => (State::Tail1, Action::SetByte2),
76_ => (State::Ground, Action::InvalidSequence),
77 },
78 State::Utf8_4_3_f0 => match byte {
790x90..=0xbf => (State::Tail2, Action::SetByte3),
80_ => (State::Ground, Action::InvalidSequence),
81 },
82 State::Utf8_4_3_f4 => match byte {
830x80..=0x8f => (State::Tail2, Action::SetByte3),
84_ => (State::Ground, Action::InvalidSequence),
85 },
86 State::Tail3 => match byte {
870x80..=0xbf => (State::Tail2, Action::SetByte3),
88_ => (State::Ground, Action::InvalidSequence),
89 },
90 State::Tail2 => match byte {
910x80..=0xbf => (State::Tail1, Action::SetByte2),
92_ => (State::Ground, Action::InvalidSequence),
93 },
94 State::Tail1 => match byte {
950x80..=0xbf => (State::Ground, Action::SetByte1),
96_ => (State::Ground, Action::InvalidSequence),
97 },
98 }
99 }
100}