diff --git a/src/ftxui/component/terminal_input_parser.cpp b/src/ftxui/component/terminal_input_parser.cpp
index 63c57ae..fb7e3ad 100644
--- a/src/ftxui/component/terminal_input_parser.cpp
+++ b/src/ftxui/component/terminal_input_parser.cpp
@@ -93,14 +93,88 @@ TerminalInputParser::Output TerminalInputParser::Parse() {
   return ParseUTF8();
 }
 
+// Code point <-> UTF-8 conversion
+//
+// ┏━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
+// ┃Byte 1  ┃Byte 2  ┃Byte 3  ┃Byte 4  ┃
+// ┡━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
+// │0xxxxxxx│        │        │        │
+// ├────────┼────────┼────────┼────────┤
+// │110xxxxx│10xxxxxx│        │        │
+// ├────────┼────────┼────────┼────────┤
+// │1110xxxx│10xxxxxx│10xxxxxx│        │
+// ├────────┼────────┼────────┼────────┤
+// │11110xxx│10xxxxxx│10xxxxxx│10xxxxxx│
+// └────────┴────────┴────────┴────────┘
+//
+// A sequence is illegal ("overlong") if a shorter representation of the same
+// code point exists.
+//
+// Returns CHARACTER once a complete, valid sequence has been consumed,
+// UNCOMPLETED as soon as Eat() fails while continuation bytes are still
+// expected, and DROP for any malformed sequence.
 TerminalInputParser::Output TerminalInputParser::ParseUTF8() {
   unsigned char head = static_cast<unsigned char>(Current());
-  for (int i = 0; i < 3; ++i, head <<= 1) {
-    if ((head & 0b11000000) != 0b11000000)
-      break;
+  unsigned char selector = 0b1000'0000;
+
+  // The non code-point part of the first byte.
+  unsigned char mask = selector;
+
+  // Find the first zero in the first byte: its index is the number of leading
+  // ones, i.e. the announced length of the sequence (0 meaning a single byte).
+  int first_zero = 8;
+  for (int i = 0; i < 8; ++i) {
+    mask |= selector;
+    if (head & selector) {
+      selector >>= 1;
+      continue;
+    }
+    first_zero = i;
+    break;
+  }
+
+  // Accumulate the value of the first byte.
+  // char32_t rather than wchar_t: wchar_t is only 16 bits wide on some
+  // platforms (e.g. Windows), which would truncate code points above U+FFFF.
+  char32_t value = static_cast<char32_t>(head & ~mask);
+
+  // Invalid UTF8: stray continuation byte, or a sequence of 5 bytes or more.
+  if (first_zero == 1 || first_zero >= 5)
+    return DROP;
+
+  // Multi byte UTF-8.
+  for (int i = 2; i <= first_zero; ++i) {
     if (!Eat())
       return UNCOMPLETED;
+
+    // Invalid continuation byte.
+    head = static_cast<unsigned char>(Current());
+    if ((head & 0b1100'0000) != 0b1000'0000)
+      return DROP;
+    value <<= 6;
+    value += head & 0b0011'1111;
   }
+
+  // Check for overlong UTF8 encoding: derive the minimal number of
+  // continuation bytes needed for `value` and compare it with what was read.
+  // NOTE(review): surrogate code points (U+D800..U+DFFF) are not rejected.
+  int extra_byte = 0;
+  if (value <= 0b000'0000'0111'1111) {
+    extra_byte = 0;
+  } else if (value <= 0b000'0111'1111'1111) {
+    extra_byte = 1;
+  } else if (value <= 0b1111'1111'1111'1111) {
+    extra_byte = 2;
+  } else if (value <= 0b1'0000'1111'1111'1111'1111) {
+    extra_byte = 3;
+  } else {
+    return DROP;
+  }
+
+  // NOTE(review): assumes the sequence started at position 0 -- TODO confirm.
+  if (extra_byte != position_)
+    return DROP;
+
   return CHARACTER;
 }
 
diff --git a/src/ftxui/component/terminal_input_parser_test.cpp b/src/ftxui/component/terminal_input_parser_test.cpp
index f2ca2d8..17d5765 100644
--- a/src/ftxui/component/terminal_input_parser_test.cpp
+++ b/src/ftxui/component/terminal_input_parser_test.cpp
@@ -149,6 +149,94 @@ TEST(Event, MouseRightClick) {
   EXPECT_FALSE(event_receiver->Receive(&received));
 }
 
+// Feeds raw bytes to the parser and checks that a well-formed UTF-8 sequence
+// yields exactly one character event, and anything else yields no event.
+TEST(Event, UTF8) {
+  struct {
+    std::vector<unsigned char> input;
+    bool valid;
+  } kTestCase[] = {
+      // Basic characters.
+      {{'a'}, true},
+      {{'z'}, true},
+      {{'A'}, true},
+      {{'Z'}, true},
+      {{'0'}, true},
+      {{'9'}, true},
+
+      // UTF-8 of various sizes:
+      {{0b0100'0001}, true},
+      {{0b1100'0010, 0b1000'0000}, true},
+      {{0b1110'0010, 0b1000'0000, 0b1000'0000}, true},
+      {{0b1111'0010, 0b1000'0000, 0b1000'0000, 0b1000'0000}, true},
+
+      // Overlong UTF-8 encoding:
+      {{0b1100'0000, 0b1000'0000}, false},
+      {{0b1110'0000, 0b1000'0000, 0b1000'0000}, false},
+      {{0b1111'0000, 0b1000'0000, 0b1000'0000, 0b1000'0000}, false},
+
+      // Test the limits between the various legal regions:
+      // https://unicode.org/versions/corrigendum1.html
+      // Limit between the valid and invalid single-byte values.
+      // {{0x7F}, true}, => Special sequence.
+      {{0x80}, false},
+      // ---
+      {{0xC1, 0x80}, false},
+      {{0xC2, 0x7F}, false},
+      {{0xC2, 0x80}, true},
+      // ---
+      {{0xDF, 0xBF}, true},
+      {{0xDF, 0xC0}, false},
+      // ---
+      {{0xE0, 0x9F, 0x80}, false},
+      {{0xE0, 0xA0, 0x7F}, false},
+      {{0xE0, 0xA0, 0x80}, true},
+      // ---
+      {{0xE0, 0xBF, 0xBF}, true},
+      // ---
+      {{0xE1, 0x7F, 0x80}, false},
+      {{0xE1, 0x80, 0x7f}, false},
+      {{0xE1, 0x80, 0x80}, true},
+      // ---
+      {{0xEF, 0xBF, 0xBF}, true},
+      {{0xEF, 0xC0, 0xBF}, false},
+      {{0xEF, 0xBF, 0xC0}, false},
+      // ---
+      {{0xF0, 0x90, 0x80}, false},
+      {{0xF0, 0x8F, 0x80, 0x80}, false},
+      {{0xF0, 0x90, 0x80, 0x7F}, false},
+      {{0xF0, 0x90, 0x80, 0x80}, true},
+      // ---
+      {{0xF1, 0x80, 0x80, 0x80}, true},
+      // ---
+      {{0xF1, 0xBF, 0xBF, 0xBF}, true},
+      // ---
+      {{0xF2, 0x80, 0x80, 0x80}, true},
+      // ---
+      {{0xF4, 0x8F, 0xBF, 0xBF}, true},
+      {{0xF4, 0x90, 0xBF, 0xBF}, false},
+
+  };
+  for (const auto& test : kTestCase) {
+    auto event_receiver = MakeReceiver<Event>();
+    {
+      // Scoped so the parser (and its sender) is destroyed before the events
+      // are read back -- presumably required for Receive() to return false
+      // instead of waiting; TODO confirm.
+      auto parser = TerminalInputParser(event_receiver->MakeSender());
+      for (auto byte : test.input)
+        parser.Add(byte);
+    }
+    Event received;
+    if (test.valid) {
+      EXPECT_TRUE(event_receiver->Receive(&received));
+      EXPECT_TRUE(received.is_character());
+    }
+    // Valid or not, no additional event must be pending.
+    EXPECT_FALSE(event_receiver->Receive(&received));
+  }
+}
+
 // Copyright 2020 Arthur Sonzogni. All rights reserved.
 // Use of this source code is governed by the MIT license that can be found in
 // the LICENSE file.