From 50e71592887aabbe1f4630ec8bcab66fc193c694 Mon Sep 17 00:00:00 2001 From: Mitchell Hashimoto Date: Sun, 4 Feb 2024 10:24:23 -0800 Subject: [PATCH] simd: implement utf8 decode until esc in C++ --- build.zig | 5 +- src/simd/index_of.cpp | 83 ++----- src/simd/index_of.h | 96 ++++++++ src/simd/main.zig | 1 + src/simd/utf8.h | 46 ++++ src/simd/utf8/checked.h | 359 +++++++++++++++++++++++++++ src/simd/utf8/core.h | 492 ++++++++++++++++++++++++++++++++++++++ src/simd/utf8/cpp11.h | 70 ++++++ src/simd/utf8/cpp17.h | 96 ++++++++ src/simd/utf8/cpp20.h | 124 ++++++++++ src/simd/utf8/unchecked.h | 287 ++++++++++++++++++++++ src/simd/vt.cpp | 168 +++++++++++++ src/simd/vt.h | 30 +++ src/simd/vt.zig | 121 ++++++++++ 14 files changed, 1910 insertions(+), 68 deletions(-) create mode 100644 src/simd/index_of.h create mode 100644 src/simd/utf8.h create mode 100644 src/simd/utf8/checked.h create mode 100644 src/simd/utf8/core.h create mode 100644 src/simd/utf8/cpp11.h create mode 100644 src/simd/utf8/cpp17.h create mode 100644 src/simd/utf8/cpp20.h create mode 100644 src/simd/utf8/unchecked.h create mode 100644 src/simd/vt.cpp create mode 100644 src/simd/vt.h create mode 100644 src/simd/vt.zig diff --git a/build.zig b/build.zig index 9fd1802ed..e454a4c18 100644 --- a/build.zig +++ b/build.zig @@ -1010,7 +1010,10 @@ fn addDeps( step.addCSourceFiles(.{ .files = &.{"src/simd/simdutf_c.cpp"} }); step.addIncludePath(.{ .path = "src/terminal/simdvt" }); step.addCSourceFiles(.{ .files = &.{"src/terminal/simdvt/example.cpp"} }); - step.addCSourceFiles(.{ .files = &.{"src/simd/index_of.cpp"} }); + step.addCSourceFiles(.{ .files = &.{ + "src/simd/index_of.cpp", + "src/simd/vt.cpp", + } }); // If we're building a lib we have some different deps const lib = step.kind == .lib; diff --git a/src/simd/index_of.cpp b/src/simd/index_of.cpp index 38846adc1..b070ae3b6 100644 --- a/src/simd/index_of.cpp +++ b/src/simd/index_of.cpp @@ -1,76 +1,24 @@ // Generates code for every target that this compiler can support. #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "simd/index_of.cpp" // this file -#include // must come before highway.h +#include // must come before highway.h #include +#include + +#include + HWY_BEFORE_NAMESPACE(); namespace ghostty { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; -// Return the index of the first occurrence of `needle` in `input` or -// `count` if not found. -template > -size_t IndexOfImpl(D d, T needle, const T* HWY_RESTRICT input, size_t count) { - // Note: due to the simplicity of this operation and the general complexity - // of SIMD, I'm going to overly comment this function to help explain the - // implementation for future maintainers. - - // The number of lanes in the vector type. - const size_t N = hn::Lanes(d); - - // Create a vector with all lanes set to `needle` so we can do a lane-wise - // comparison with the input. - const hn::Vec needle_vec = Set(d, needle); - - // Compare N elements at a time. - size_t i = 0; - for (; i + N <= count; i += N) { - // Load the N elements from our input into a vector. - const hn::Vec input_vec = hn::LoadU(d, input + i); - - // Compare the input vector with the needle vector. This produces - // a vector where each lane is 0xFF if the corresponding lane in - // `input_vec` is equal to the corresponding lane in `needle_vec`. - const hn::Mask eq_mask = hn::Eq(needle_vec, input_vec); - - // Find the index within the vector where the first true value is. - const intptr_t pos = hn::FindFirstTrue(d, eq_mask); - - // If we found a match, return the index into the input. - if (pos >= 0) return i + static_cast(pos); - } - - // Since we compare N elements at a time, we may have some elements left - // if count modulo N != 0. We need to scan the remaining elements. To - // be simple, we search one element at a time. - if (i != count) { - // Create a new vector with only one relevant lane. - const hn::CappedTag d1; - using D1 = decltype(d1); - - // Get an equally sized needle vector with only one lane. - const hn::Vec needle1 = Set(d1, GetLane(needle_vec)); - - // Go through the remaining elements and do similar logic to - // the previous loop to find any matches. - for (; i < count; ++i) { - const hn::Vec input_vec = hn::LoadU(d1, input + i); - const hn::Mask eq_mask = hn::Eq(needle1, input_vec); - if (hn::AllTrue(d1, eq_mask)) return i; - } - } - - return count; -} - size_t IndexOf(const uint8_t needle, - const uint8_t* HWY_RESTRICT input, - size_t count) { - const hn::ScalableTag d; - return IndexOfImpl(d, needle, input, count); + const uint8_t* HWY_RESTRICT input, + size_t count) { + const hn::ScalableTag d; + return IndexOfImpl(d, needle, input, count); } } // namespace HWY_NAMESPACE @@ -86,19 +34,20 @@ namespace ghostty { HWY_EXPORT(IndexOf); size_t IndexOf(const uint8_t needle, - const uint8_t* HWY_RESTRICT input, - size_t count) { - return HWY_DYNAMIC_DISPATCH(IndexOf)(needle, input, count); + const uint8_t* HWY_RESTRICT input, + size_t count) { + return HWY_DYNAMIC_DISPATCH(IndexOf)(needle, input, count); } } // namespace ghostty extern "C" { -size_t ghostty_simd_index_of(const uint8_t needle, const uint8_t* HWY_RESTRICT input, size_t count) { - return ghostty::IndexOf(needle, input, count); +size_t ghostty_simd_index_of(const uint8_t needle, + const uint8_t* HWY_RESTRICT input, + size_t count) { + return ghostty::IndexOf(needle, input, count); } - } #endif // HWY_ONCE diff --git a/src/simd/index_of.h b/src/simd/index_of.h new file mode 100644 index 000000000..2515f7e0a --- /dev/null +++ b/src/simd/index_of.h @@ -0,0 +1,96 @@ +#if defined(GHOSTTY_SIMD_INDEX_OF_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef GHOSTTY_SIMD_INDEX_OF_H_ +#undef GHOSTTY_SIMD_INDEX_OF_H_ +#else +#define GHOSTTY_SIMD_INDEX_OF_H_ +#endif + +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace ghostty { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; + +// Return the index of the first occurrence of `needle` in `input`, where +// the input and needle are already loaded into vectors. +template > +std::optional IndexOfChunk(D d, + hn::Vec needle_vec, + hn::Vec input_vec) { + // Compare the input vector with the needle vector. This produces + // a vector where each lane is 0xFF if the corresponding lane in + // `input_vec` is equal to the corresponding lane in `needle_vec`. + const hn::Mask eq_mask = hn::Eq(needle_vec, input_vec); + + // Find the index within the vector where the first true value is. + const intptr_t pos = hn::FindFirstTrue(d, eq_mask); + + // If we found a match, return the index into the input. + if (pos >= 0) { + return std::optional(static_cast(pos)); + } else { + return std::nullopt; + } +} + +// Return the index of the first occurrence of `needle` in `input` or +// `count` if not found. +template > +size_t IndexOfImpl(D d, T needle, const T* HWY_RESTRICT input, size_t count) { + // Note: due to the simplicity of this operation and the general complexity + // of SIMD, I'm going to overly comment this function to help explain the + // implementation for future maintainers. + + // The number of lanes in the vector type. + const size_t N = hn::Lanes(d); + + // Create a vector with all lanes set to `needle` so we can do a lane-wise + // comparison with the input. + const hn::Vec needle_vec = Set(d, needle); + + // Compare N elements at a time. + size_t i = 0; + for (; i + N <= count; i += N) { + // Load the N elements from our input into a vector and check the chunk. + const hn::Vec input_vec = hn::LoadU(d, input + i); + if (auto pos = IndexOfChunk(d, needle_vec, input_vec)) { + return i + pos.value(); + } + } + + // Since we compare N elements at a time, we may have some elements left + // if count modulo N != 0. We need to scan the remaining elements. To + // be simple, we search one element at a time. + if (i != count) { + // Create a new vector with only one relevant lane. + const hn::CappedTag d1; + using D1 = decltype(d1); + + // Get an equally sized needle vector with only one lane. + const hn::Vec needle1 = Set(d1, GetLane(needle_vec)); + + // Go through the remaining elements and do similar logic to + // the previous loop to find any matches. + for (; i < count; ++i) { + const hn::Vec input_vec = hn::LoadU(d1, input + i); + const hn::Mask eq_mask = hn::Eq(needle1, input_vec); + if (hn::AllTrue(d1, eq_mask)) + return i; + } + } + + return count; +} + +size_t IndexOf(const uint8_t needle, + const uint8_t* HWY_RESTRICT input, + size_t count); + +} // namespace HWY_NAMESPACE +} // namespace ghostty +HWY_AFTER_NAMESPACE(); + +#endif // GHOSTTY_SIMD_INDEX_OF_H_ diff --git a/src/simd/main.zig b/src/simd/main.zig index 4523c1b40..3ce5d18ab 100644 --- a/src/simd/main.zig +++ b/src/simd/main.zig @@ -8,6 +8,7 @@ pub const utf8_count = @import("utf8_count.zig"); pub const utf8_decode = @import("utf8_decode.zig"); pub const utf8_validate = @import("utf8_validate.zig"); pub const index_of = @import("index_of.zig"); +pub const vt = @import("vt.zig"); // TODO: temporary, only for zig build simd to inspect disasm easily // pub fn main() !void { diff --git a/src/simd/utf8.h b/src/simd/utf8.h new file mode 100644 index 000000000..b51353093 --- /dev/null +++ b/src/simd/utf8.h @@ -0,0 +1,46 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +/* +To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro +and set it to one of the values used by the __cplusplus predefined macro. + +For instance, + #define UTF_CPP_CPLUSPLUS 199711L +will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard. +Some library features will be disabled. + +If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus. +*/ + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/src/simd/utf8/checked.h b/src/simd/utf8/checked.h new file mode 100644 index 000000000..98949f8bc --- /dev/null +++ b/src/simd/utf8/checked.h @@ -0,0 +1,359 @@ +// Copyright 2006-2016 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" +#include + +namespace utf8 +{ + // Base for the exceptions that may be thrown from the library + class exception : public ::std::exception { + }; + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public exception { + utfchar32_t cp; + public: + invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } + utfchar32_t code_point() const {return cp;} + }; + + class invalid_utf8 : public exception { + utfchar8_t u8; + public: + invalid_utf8 (utfchar8_t u) : u8(u) {} + invalid_utf8 (char c) : u8(static_cast(c)) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } + utfchar8_t utf8_octet() const {return u8;} + }; + + class invalid_utf16 : public exception { + utfchar16_t u16; + public: + invalid_utf16 (utfchar16_t u) : u16(u) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } + utfchar16_t utf16_word() const {return u16;} + }; + + class not_enough_room : public exception { + public: + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; } + }; + + /// The library API - functions intended to be called by the users + + template + octet_iterator append(utfchar32_t cp, octet_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append(cp, result); + } + + inline void append(utfchar32_t cp, std::string& s) + { + append(cp, std::back_inserter(s)); + } + + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append16(cp, result); + } + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::append (replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const utfchar32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::replace_invalid(start, end, out, replacement_marker); + } + + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template + utfchar32_t next(octet_iterator& it, octet_iterator end) + { + utfchar32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); + switch (err_code) { + case internal::UTF8_OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + throw invalid_utf8(static_cast(*it)); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template + utfchar32_t next16(word_iterator& it, word_iterator end) + { + utfchar32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp); + if (err_code == internal::NOT_ENOUGH_ROOM) + throw not_enough_room(); + return cp; + } + + template + utfchar32_t peek_next(octet_iterator it, octet_iterator end) + { + return utf8::next(it, end); + } + + template + utfchar32_t prior(octet_iterator& it, octet_iterator start) + { + // can't do much if it == start + if (it == start) + throw not_enough_room(); + + octet_iterator end = it; + // Go back until we hit either a lead octet or start + while (utf8::internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return utf8::peek_next(it, end); + } + + template + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::prior(it, end); + } else { + // forward + for (distance_type i = zero; i < n; ++i) + utf8::next(it, end); + } + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::next(first, last); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + utfchar32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start != end) { + const utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); + if (utf8::internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast(trail_surrogate)); + } + else + throw invalid_utf16(static_cast(cp)); + + } + // Lone trail surrogate + else if (utf8::internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast(cp)); + + result = utf8::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + const utfchar32_t cp = utf8::next(start, end); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::next(start, end); + + return result; + } + + // The iterator class + template + class iterator { + octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; + public: + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + iterator () {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& rangestart, + const octet_iterator& rangeend) : + it(octet_it), range_start(rangestart), range_end(rangeend) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + // the default "big three" are OK + octet_iterator base () const { return it; } + utfchar32_t operator * () const + { + octet_iterator temp = it; + return utf8::next(temp, range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + utf8::next(it, range_end); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + utf8::next(it, range_end); + return temp; + } + iterator& operator -- () + { + utf8::prior(it, range_start); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace utf8 + +#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later +#include "cpp20.h" +#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later +#include "cpp17.h" +#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later +#include "cpp11.h" +#endif // C++ 11 or later + +#endif //header guard + diff --git a/src/simd/utf8/core.h b/src/simd/utf8/core.h new file mode 100644 index 000000000..4494c538e --- /dev/null +++ b/src/simd/utf8/core.h @@ -0,0 +1,492 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include +#include +#include + +// Determine the C++ standard version. +// If the user defines UTF_CPP_CPLUSPLUS, use that. +// Otherwise, trust the unreliable predefined macro __cplusplus + +#if !defined UTF_CPP_CPLUSPLUS + #define UTF_CPP_CPLUSPLUS __cplusplus +#endif + +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #define UTF_CPP_OVERRIDE override + #define UTF_CPP_NOEXCEPT noexcept +#else // C++ 98/03 + #define UTF_CPP_OVERRIDE + #define UTF_CPP_NOEXCEPT throw() +#endif // C++ 11 or later + + +namespace utf8 +{ +// The typedefs for 8-bit, 16-bit and 32-bit code units +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later + typedef char8_t utfchar8_t; + #else // C++ 11/14/17 + typedef unsigned char utfchar8_t; + #endif + typedef char16_t utfchar16_t; + typedef char32_t utfchar32_t; +#else // C++ 98/03 + typedef unsigned char utfchar8_t; + typedef unsigned short utfchar16_t; + typedef unsigned int utfchar32_t; +#endif // C++ 11 or later + +// Helper code - not intended to be directly called by the library users. May be changed at any time +namespace internal +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const utfchar16_t LEAD_SURROGATE_MIN = 0xd800u; + const utfchar16_t LEAD_SURROGATE_MAX = 0xdbffu; + const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) + const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN + + // Maximum valid value for a Unicode code point + const utfchar32_t CODE_POINT_MAX = 0x0010ffffu; + + template + inline utfchar8_t mask8(octet_type oc) + { + return static_cast(0xff & oc); + } + template + inline utfchar16_t mask16(u16_type oc) + { + return static_cast(0xffff & oc); + } + + template + inline bool is_trail(octet_type oc) + { + return ((utf8::internal::mask8(oc) >> 6) == 0x2); + } + + inline bool is_lead_surrogate(utfchar32_t cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); + } + + inline bool is_trail_surrogate(utfchar32_t cp) + { + return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + inline bool is_surrogate(utfchar32_t cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + inline bool is_code_point_valid(utfchar32_t cp) + { + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + } + + inline bool is_in_bmp(utfchar32_t cp) + { + return cp < utfchar32_t(0x10000); + } + + template + int sequence_length(octet_iterator lead_it) + { + const utfchar8_t lead = utf8::internal::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + + inline bool is_overlong_sequence(utfchar32_t cp, int length) + { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else if (cp < 0x10000) { + if (length != 3) + return true; + } + return false; + } + + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + /// Helper for get_sequence_x + template + utf_error increase_safely(octet_iterator& it, const octet_iterator end) + { + if (++it == end) + return NOT_ENOUGH_ROOM; + + if (!utf8::internal::is_trail(*it)) + return INCOMPLETE_SEQUENCE; + + return UTF8_OK; + } + + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + + /// get_sequence_x functions decode utf-8 sequences of the length x + template + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + return UTF8_OK; + } + + template + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + + return UTF8_OK; + } + + template + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + template + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR + + template + utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + octet_iterator original_it = it; + + utfchar32_t cp = 0; + // Determine the sequence length based on the lead octet + const int length = utf8::internal::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = UTF8_OK; + switch (length) { + case 0: + return INVALID_LEAD; + case 1: + err = utf8::internal::get_sequence_1(it, end, cp); + break; + case 2: + err = utf8::internal::get_sequence_2(it, end, cp); + break; + case 3: + err = utf8::internal::get_sequence_3(it, end, cp); + break; + case 4: + err = utf8::internal::get_sequence_4(it, end, cp); + break; + } + + if (err == UTF8_OK) { + // Decoding succeeded. Now, security checks... + if (utf8::internal::is_code_point_valid(cp)) { + if (!utf8::internal::is_overlong_sequence(cp, length)){ + // Passed! Return here. + code_point = cp; + ++it; + return UTF8_OK; + } + else + err = OVERLONG_SEQUENCE; + } + else + err = INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + + template + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { + utfchar32_t ignored; + return utf8::internal::validate_next(it, end, ignored); + } + + template + utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + word_iterator original_it = it; + + utf_error err = UTF8_OK; + + const utfchar16_t first_word = *it++; + if (!is_surrogate(first_word)) { + code_point = first_word; + return UTF8_OK; + } + else { + if (it == end) + err = NOT_ENOUGH_ROOM; + else if (is_lead_surrogate(first_word)) { + const utfchar16_t second_word = *it++; + if (is_trail_surrogate(second_word)) { + code_point = (first_word << 10) + second_word + SURROGATE_OFFSET; + return UTF8_OK; + } else + err = INCOMPLETE_SEQUENCE; + + } else { + err = INVALID_LEAD; + } + } + // error branch + it = original_it; + return err; + } + + // Internal implementation of both checked and unchecked append() function + // This function will be invoked by the overloads below, as they will know + // the octet_type. + template + octet_iterator append(utfchar32_t cp, octet_iterator result) { + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + // One of the following overloads will be invoked from the API calls + + // A simple (but dangerous) case: the caller appends byte(s) to a char array + inline char* append(utfchar32_t cp, char* result) { + return append(cp, result); + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. append(cp, std::back_inserter(str)); + template + std::back_insert_iterator append + (utfchar32_t cp, std::back_insert_iterator result) { + return append, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine octet_type + // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong. + template + octet_iterator append(utfchar32_t cp, octet_iterator result) { + return append(cp, result); + } + + // Internal implementation of both checked and unchecked append16() function + // This function will be invoked by the overloads below, as they will know + // the word_type. + template + word_iterator append16(utfchar32_t cp, word_iterator result) { + if (is_in_bmp(cp)) + *(result++) = static_cast(cp); + else { + // Code points from the supplementary planes are encoded via surrogate pairs + *(result++) = static_cast(LEAD_OFFSET + (cp >> 10)); + *(result++) = static_cast(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + } + return result; + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. append16(cp, std::back_inserter(str)); + template + std::back_insert_iterator append16 + (utfchar32_t cp, std::back_insert_iterator result) { + return append16, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine word_type + // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong. + template + word_iterator append16(utfchar32_t cp, word_iterator result) { + return append16(cp, result); + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const utfchar8_t bom[] = {0xef, 0xbb, 0xbf}; + + template + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); + if (err_code != internal::UTF8_OK) + return result; + } + return result; + } + + inline const char* find_invalid(const char* str) + { + const char* end = str + std::strlen(str); + return find_invalid(str, end); + } + + inline std::size_t find_invalid(const std::string& s) + { + std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string::npos : static_cast(invalid - s.begin()); + } + + template + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (utf8::find_invalid(start, end) == end); + } + + inline bool is_valid(const char* str) + { + return (*(utf8::find_invalid(str)) == '\0'); + } + + inline bool is_valid(const std::string& s) + { + return is_valid(s.begin(), s.end()); + } + + + + template + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + ); + } + + inline bool starts_with_bom(const std::string& s) + { + return starts_with_bom(s.begin(), s.end()); + } +} // namespace utf8 + +#endif // header guard + + diff --git a/src/simd/utf8/cpp11.h b/src/simd/utf8/cpp11.h new file mode 100644 index 000000000..691633c84 --- /dev/null +++ b/src/simd/utf8/cpp11.h @@ -0,0 +1,70 @@ +// Copyright 2018 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 +#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 + +#include "checked.h" + +namespace utf8 +{ + inline void append16(utfchar32_t cp, std::u16string& s) + { + append16(cp, std::back_inserter(s)); + } + + inline std::string utf16to8(const std::u16string& s) + { + std::string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::string utf32to8(const std::u32string& s) + { + std::string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } +} // namespace utf8 + +#endif // header guard + diff --git a/src/simd/utf8/cpp17.h b/src/simd/utf8/cpp17.h new file mode 100644 index 000000000..6e2fcc238 --- /dev/null +++ b/src/simd/utf8/cpp17.h @@ -0,0 +1,96 @@ +// Copyright 2018 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "cpp11.h" + +namespace utf8 +{ + inline std::string utf16to8(std::u16string_view s) + { + std::string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(std::string_view s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::string utf32to8(std::u32string_view s) + { + std::string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(std::string_view s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(std::string_view s) + { + std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); + } + + inline bool is_valid(std::string_view s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::string replace_invalid(std::string_view s, char32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(std::string_view s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(std::string_view s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/src/simd/utf8/cpp20.h b/src/simd/utf8/cpp20.h new file mode 100644 index 000000000..07b61d0fb --- /dev/null +++ b/src/simd/utf8/cpp20.h @@ -0,0 +1,124 @@ +// Copyright 2022 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "cpp17.h" + +namespace utf8 +{ + inline std::u8string utf16tou8(const std::u16string& s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf16tou8(std::u16string_view s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string_view& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string_view& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string_view& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(const std::u8string& s) + { + std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); + } + + inline bool is_valid(const std::u8string& s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::u8string replace_invalid(const std::u8string& s) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(const std::u8string& s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/src/simd/utf8/unchecked.h b/src/simd/utf8/unchecked.h new file mode 100644 index 000000000..65d4948f2 --- /dev/null +++ b/src/simd/utf8/unchecked.h @@ -0,0 +1,287 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" + +namespace utf8 +{ + namespace unchecked + { + template + octet_iterator append(utfchar32_t cp, octet_iterator result) + { + return internal::append(cp, result); + } + + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + return internal::append16(cp, result); + } + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::unchecked::append(replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::unchecked::append(replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::unchecked::append(replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const utfchar32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); + } + + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + template + utfchar32_t next(octet_iterator& it) + { + utfchar32_t cp = utf8::internal::mask8(*it); + switch (utf8::internal::sequence_length(it)) { + case 1: + break; + case 2: + it++; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + ++it; + cp += (*it) & 0x3f; + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp += (utf8::internal::mask8(*it) << 6) & 0xfff; + ++it; + cp += (*it) & 0x3f; + break; + } + ++it; + return cp; + } + + template + utfchar32_t peek_next(octet_iterator it) + { + return utf8::unchecked::next(it); + } + + template + utfchar32_t next16(word_iterator& it) + { + utfchar32_t cp = utf8::internal::mask16(*it++); + if (utf8::internal::is_lead_surrogate(cp)) + return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET; + return cp; + } + + template + utfchar32_t prior(octet_iterator& it) + { + while (utf8::internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + + template + void advance(octet_iterator& it, distance_type n) + { + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::unchecked::prior(it); + } else { + // forward + for (distance_type i = zero; i < n; ++i) + utf8::unchecked::next(it); + } + } + + template + typename std::iterator_traits::difference_type + distance(octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::unchecked::next(first); + return dist; + } + + template + octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + utfchar32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start == end) + return result; + utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = utf8::unchecked::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + utfchar32_t cp = utf8::unchecked::next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::unchecked::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::unchecked::next(start); + + return result; + } + + // The iterator class + template + class iterator { + octet_iterator it; + public: + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; + iterator () {} + explicit iterator (const octet_iterator& octet_it): it(octet_it) {} + // the default "big three" are OK + octet_iterator base () const { return it; } + utfchar32_t operator * () const + { + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + bool operator == (const iterator& rhs) const + { + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + ::std::advance(it, utf8::internal::sequence_length(it)); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + ::std::advance(it, utf8::internal::sequence_length(it)); + return temp; + } + iterator& operator -- () + { + utf8::unchecked::prior(it); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::unchecked::prior(it); + return temp; + } + }; // class iterator + + } // namespace utf8::unchecked +} // namespace utf8 + + +#endif // header guard + diff --git a/src/simd/vt.cpp b/src/simd/vt.cpp new file mode 100644 index 000000000..fe4d8a1fa --- /dev/null +++ b/src/simd/vt.cpp @@ -0,0 +1,168 @@ +// Generates code for every target that this compiler can support. +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "simd/vt.cpp" // this file +#include // must come before highway.h +#include + +#include +#include +#include +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace ghostty { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; + +using T = uint8_t; + +// Decode the UTF-8 text in input into output. Returns the number of decoded +// characters. This function assumes output is large enough. +// +// This function handles malformed UTF-8 sequences by inserting a +// replacement character (U+FFFD) and continuing to decode. This function +// will consume the entire input no matter what. +size_t DecodeUTF8(const uint8_t* HWY_RESTRICT input, + size_t count, + char32_t* output) { + // Its possible for our input to be empty since DecodeUTF8UntilControlSeq + // doesn't check for this. + if (count == 0) { + return 0; + } + + // Assume no errors for fast path. + const size_t decoded = simdutf::convert_utf8_to_utf32( + reinterpret_cast(input), count, output); + if (decoded > 0) { + return decoded; + } + + // Errors in the UTF input, take a slow path and do a decode with + // replacement (with U+FFFD). Note that simdutf doesn't have a + // decode with replacement API: + // https://github.com/simdutf/simdutf/issues/147 + // + // Because of this, we use a separate library with heap allocation + // that is much, much slower (the allocation is slower, the algorithm + // is slower, etc.) This is just so we have something that works. + // I want to replace this. + std::vector replacement_result; + utf8::replace_invalid(input, input + count, + std::back_inserter(replacement_result), 0xFFFD); + return DecodeUTF8(reinterpret_cast(replacement_result.data()), + replacement_result.size(), output); +} + +/// Decode the UTF-8 text in input into output until an escape +/// character is found. This returns the number of bytes consumed +/// from input and writes the number of decoded characters into +/// output_count. +/// +/// This may return a value less than count even with no escape +/// character if the input ends with an incomplete UTF-8 sequence. +/// The caller should check the next byte manually to determine +/// if it is incomplete. +template +size_t DecodeUTF8UntilControlSeqImpl(D d, + const T* HWY_RESTRICT input, + size_t count, + char32_t* output, + size_t* output_count) { + const size_t N = hn::Lanes(d); + + // Create a vector containing ESC since that denotes a control sequence. + const hn::Vec esc_vec = Set(d, 0x1B); + + // Compare N elements at a time. + size_t i = 0; + for (; i + N <= count; i += N) { + // Load the N elements from our input into a vector. + const hn::Vec input_vec = hn::LoadU(d, input + i); + + // If we don't have any escapes we keep going. We want to accumulate + // the largest possible valid UTF-8 sequence before decoding. + // TODO(mitchellh): benchmark this vs decoding every time + const auto esc_idx = IndexOfChunk(d, esc_vec, input_vec); + if (!esc_idx) { + continue; + } + + // We have an ESC char, decode up to this point. We start by assuming + // a valid UTF-8 sequence and slow-path into error handling if we find + // an invalid sequence. + *output_count = DecodeUTF8(input, i + esc_idx.value(), output); + return i + esc_idx.value(); + } + + // If we have leftover input then we decode it one byte at a time (slow!) + // using pretty much the same logic as above. + if (i != count) { + const hn::CappedTag d1; + using D1 = decltype(d1); + const hn::Vec esc1 = Set(d1, GetLane(esc_vec)); + for (; i < count; ++i) { + const hn::Vec input_vec = hn::LoadU(d1, input + i); + const auto esc_idx = IndexOfChunk(d1, esc1, input_vec); + if (!esc_idx) { + continue; + } + + *output_count = DecodeUTF8(input, i + esc_idx.value(), output); + return i + esc_idx.value(); + } + } + + // If we reached this point, its possible for our input to have an + // incomplete sequence because we're consuming the full input. We need + // to trim any incomplete sequences from the end of the input. + const size_t trimmed_len = + simdutf::trim_partial_utf8(reinterpret_cast(input), i); + *output_count = DecodeUTF8(input, trimmed_len, output); + return trimmed_len; +} + +size_t DecodeUTF8UntilControlSeq(const uint8_t* HWY_RESTRICT input, + size_t count, + char32_t* output, + size_t* output_count) { + const hn::ScalableTag d; + return DecodeUTF8UntilControlSeqImpl(d, input, count, output, output_count); +} + +} // namespace HWY_NAMESPACE +} // namespace ghostty +HWY_AFTER_NAMESPACE(); + +// HWY_ONCE is true for only one of the target passes +#if HWY_ONCE + +namespace ghostty { + +HWY_EXPORT(DecodeUTF8UntilControlSeq); + +size_t DecodeUTF8UntilControlSeq(const uint8_t* HWY_RESTRICT input, + size_t count, + char32_t* output, + size_t* output_count) { + return HWY_DYNAMIC_DISPATCH(DecodeUTF8UntilControlSeq)(input, count, output, + output_count); +} + +} // namespace ghostty + +extern "C" { + +size_t ghostty_simd_decode_utf8_until_control_seq(const uint8_t* HWY_RESTRICT + input, + size_t count, + char32_t* output, + size_t* output_count) { + return ghostty::DecodeUTF8UntilControlSeq(input, count, output, output_count); +} + +} // extern "C" + +#endif // HWY_ONCE diff --git a/src/simd/vt.h b/src/simd/vt.h new file mode 100644 index 000000000..c54ae93f8 --- /dev/null +++ b/src/simd/vt.h @@ -0,0 +1,30 @@ +#if defined(GHOSTTY_SIMD_VT_H_) == defined(HWY_TARGET_TOGGLE) +#ifdef GHOSTTY_SIMD_VT_H_ +#undef GHOSTTY_SIMD_VT_H_ +#else +#define GHOSTTY_SIMD_VT_H_ +#endif + +#include + +HWY_BEFORE_NAMESPACE(); +namespace ghostty { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; + +} // namespace HWY_NAMESPACE +} // namespace ghostty +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace ghostty { + +typedef void (*PrintFunc)(const char32_t* chars, size_t count); + +} // namespace ghostty + +#endif // HWY_ONCE + +#endif // GHOSTTY_SIMD_VT_H_ diff --git a/src/simd/vt.zig b/src/simd/vt.zig new file mode 100644 index 000000000..2e59957c1 --- /dev/null +++ b/src/simd/vt.zig @@ -0,0 +1,121 @@ +const std = @import("std"); + +// vt.cpp +extern "c" fn ghostty_simd_decode_utf8_until_control_seq( + input: [*]const u8, + count: usize, + output: [*]u32, + output_count: *usize, +) usize; + +const DecodeResult = struct { + consumed: usize, + decoded: usize, +}; + +pub fn utf8DecodeUntilControlSeq( + input: []const u8, + output: []u32, +) DecodeResult { + var decoded: usize = 0; + const consumed = ghostty_simd_decode_utf8_until_control_seq( + input.ptr, + input.len, + output.ptr, + &decoded, + ); + + return .{ .consumed = consumed, .decoded = decoded }; +} + +test "decode no escape" { + const testing = std.testing; + + var output: [1024]u32 = undefined; + + // TODO: many more test cases + { + const str = "hello" ** 128; + try testing.expectEqual(DecodeResult{ + .consumed = str.len, + .decoded = str.len, + }, utf8DecodeUntilControlSeq(str, &output)); + } +} + +test "decode ASCII to escape" { + const testing = std.testing; + + var output: [1024]u32 = undefined; + + // TODO: many more test cases + { + const prefix = "hello" ** 64; + const str = prefix ++ "\x1b" ++ ("world" ** 64); + try testing.expectEqual(DecodeResult{ + .consumed = prefix.len, + .decoded = prefix.len, + }, utf8DecodeUntilControlSeq(str, &output)); + } +} + +test "decode immediate esc sequence" { + const testing = std.testing; + + var output: [64]u32 = undefined; + const str = "\x1b[?5s"; + try testing.expectEqual(DecodeResult{ + .consumed = 0, + .decoded = 0, + }, utf8DecodeUntilControlSeq(str, &output)); +} + +test "decode incomplete UTF-8" { + const testing = std.testing; + + var output: [64]u32 = undefined; + + // 2-byte + { + const str = "hello\xc2"; + try testing.expectEqual(DecodeResult{ + .consumed = 5, + .decoded = 5, + }, utf8DecodeUntilControlSeq(str, &output)); + } + + // 3-byte + { + const str = "hello\xe0\x00"; + try testing.expectEqual(DecodeResult{ + .consumed = 5, + .decoded = 5, + }, utf8DecodeUntilControlSeq(str, &output)); + } + + // 4-byte + { + const str = "hello\xf0\x90"; + try testing.expectEqual(DecodeResult{ + .consumed = 5, + .decoded = 5, + }, utf8DecodeUntilControlSeq(str, &output)); + } +} + +test "decode invalid UTF-8" { + const testing = std.testing; + + var output: [64]u32 = undefined; + + // Invalid leading 1s + { + const str = "hello\xc2\x00"; + try testing.expectEqual(DecodeResult{ + .consumed = 7, + .decoded = 7, + }, utf8DecodeUntilControlSeq(str, &output)); + } + + try testing.expectEqual(@as(u32, 0xFFFD), output[5]); +}