#include <string>
#include <vector>
#include <charconv>
#include <stdexcept>
// Splits `str` into the non-empty pieces found between occurrences of `delim`.
//
// `delim` is matched as a whole substring, both when locating the end of a
// token and when skipping past a separator. Empty tokens (produced by leading,
// trailing or adjacent delimiters) are dropped.
//
// An empty `delim` cannot separate anything: the whole of `str` is returned as
// a single token (or nothing if `str` is empty).
std::vector<std::string> split(const std::string& str, const std::string& delim) {
    std::vector<std::string> tokens;

    if (delim.empty()) {
        // Searching for "" matches at every position and would never advance.
        if (!str.empty()) {
            tokens.push_back(str);
        }
        return tokens;
    }

    std::string::size_type start = 0;
    while (start < str.size()) {
        const std::string::size_type end = str.find(delim, start);
        if (end == std::string::npos) {
            // No further delimiter: the remainder is the last token.
            tokens.push_back(str.substr(start));
            break;
        }
        if (end > start) {
            tokens.push_back(str.substr(start, end - start));
        }
        // Step over exactly one delimiter occurrence, not over any run of
        // characters that merely appear in it.
        start = end + delim.size();
    }
    return tokens;
}
// Encodes the code point `utf` as UTF-8 into `out` and appends a NUL byte.
//
// Returns the number of encoded bytes (1-4), not counting the terminator.
// For values above 0x10FFFF the bytes of U+FFFD (EF BF BD) plus a NUL are
// written, but 0 is returned so the caller can detect the failure.
//
// `out` must have room for at least 5 bytes (4 encoded bytes + terminator).
// Values in the surrogate range are not treated specially; they go through
// the 3-byte form like any other value up to 0xFFFF.
std::uint32_t utf8_encode(char* out, uint32_t utf)
{
    // Continuation byte (10xxxxxx) carrying the six bits that start at `shift`.
    const auto continuation = [utf](unsigned shift) -> char {
        return static_cast<char>(((utf >> shift) & 0x3F) | 0x80);
    };

    if (utf > 0x10FFFF) {
        // Outside the Unicode range: leave a replacement character behind.
        out[0] = static_cast<char>(0xEF);
        out[1] = static_cast<char>(0xBF);
        out[2] = static_cast<char>(0xBD);
        out[3] = 0;
        return 0;
    }

    if (utf < 0x80) {
        // Single byte, identical to ASCII.
        out[0] = static_cast<char>(utf);
        out[1] = 0;
        return 1;
    }

    if (utf < 0x800) {
        // Two bytes: 110xxxxx 10xxxxxx
        out[0] = static_cast<char>(((utf >> 6) & 0x1F) | 0xC0);
        out[1] = continuation(0);
        out[2] = 0;
        return 2;
    }

    if (utf < 0x10000) {
        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        out[0] = static_cast<char>(((utf >> 12) & 0x0F) | 0xE0);
        out[1] = continuation(6);
        out[2] = continuation(0);
        out[3] = 0;
        return 3;
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    out[0] = static_cast<char>(((utf >> 18) & 0x07) | 0xF0);
    out[1] = continuation(12);
    out[2] = continuation(6);
    out[3] = continuation(0);
    out[4] = 0;
    return 4;
}
// Expands textual Unicode escapes in `input` to UTF-8 and copies everything
// else through after checking that it is structurally valid UTF-8.
//
// Recognised escapes (a backslash followed by `u` or `U`):
//   \uXXXX      - up to 4 hex digits
//   \UXXXXXXXX  - up to 8 hex digits
// Parsing is lenient about length: the hex digits actually present (at least
// one, at most the limit above) are consumed and the rest of the text is
// processed normally. A `\u` / `\U` with nothing after it is copied through
// as literal characters.
//
// NOTE: surrogate escapes (\uD800-\uDFFF) are encoded one by one via
// utf8_encode; a high/low pair is not combined into a single code point.
//
// Non-escape bytes are validated only for lead-byte / continuation-byte
// structure; overlong forms are not rejected.
//
// Throws std::invalid_argument if an escape has no hex digit or names a value
// above U+10FFFF, if a byte cannot start a UTF-8 sequence, or if a multi-byte
// sequence is cut off by the end of the input.
// Throws std::runtime_error if a multi-byte sequence lacks a continuation byte.
std::string cp_to_utf8(const std::string& input) {
    using namespace std::string_literals;
    using size_type = std::string::size_type;

    std::string out;
    // An escape never expands to more bytes than it occupies in the input.
    out.reserve(input.size());

    size_type i = 0;
    while (i < input.size()) {
        const std::string first_two = input.substr(i, 2);

        // Maximum number of hex digits the escape at `i` may carry (0 = none).
        size_type max_digits = 0;
        if (first_two == "\\u") {
            max_digits = 4;
        }
        else if (first_two == "\\U") {
            max_digits = 8;
        }

        const std::string hex =
            max_digits != 0 ? input.substr(i + 2, max_digits) : std::string{};

        if (!hex.empty()) {
            unsigned int value = 0;
            const char* const first = hex.data();
            const auto [ptr, err] = std::from_chars(first, first + hex.size(), value, 16);

            // utf8_encode reports values above U+10FFFF by returning 0, which
            // would silently drop the escape; reject them here instead.
            if (err != std::errc() || value > 0x10FFFF) {
                throw std::invalid_argument{ "Bad unicode escape: "s + first_two + hex };
            }

            char buf[8];
            const auto len = utf8_encode(buf, value);
            out.append(buf, len);

            // Skip the two-character prefix plus the digits actually parsed.
            i += 2 + static_cast<size_type>(ptr - first);
            continue;
        }

        // Ordinary text: determine the sequence length from the lead byte.
        const auto lead = static_cast<unsigned char>(input[i]);
        size_type n = 0;
        if ((lead & 0x80) == 0) n = 1;
        else if ((lead & 0xE0) == 0xC0) n = 2;
        else if ((lead & 0xF0) == 0xE0) n = 3;
        else if ((lead & 0xF8) == 0xF0) n = 4;
        else {
            // Stray continuation byte (10xxxxxx) or 0xF8-0xFF: not a lead byte.
            throw std::invalid_argument{ "Invalid utf-8 string" };
        }

        if (input.size() - i < n) {
            throw std::invalid_argument{ "Invalid utf-8 string" };
        }
        for (size_type k = i + 1; k < i + n; ++k) {
            if ((static_cast<unsigned char>(input[k]) & 0xC0) != 0x80) {
                throw std::runtime_error("Expected continuation byte at "s + std::to_string(i));
            }
        }

        out.append(input, i, n);
        i += n;
    }
    return out;
}