blob: 7c668f91655322dba7d103f82922a18c5b4e38c1 [file] [log] [blame] [edit]
#include "sutf.h"
#include <vector>
#include <string>
#include <set>
#include <filesystem>
#include <iterator>
#include <climits>
#include <cerrno>
// Parses the command line into a CommandLine.
//
// Informational flags (-h/--help, -u/--usage, -l/--list) are handled first:
// the matching text is printed and a default-constructed CommandLine is
// returned without looking at any other argument.
//
// Every remaining argument is either an option (-f/-t/-o followed by a value,
// or --from-code=/--to-code=/--output= with the value attached) or the path
// of an input file, which must exist.
//
// Throws std::invalid_argument when an option is given twice, when a short
// option is the last argument and therefore has no value, or when the source
// or destination encoding is missing.
// Throws std::runtime_error when an input file does not exist.
CommandLine parse_and_validate_arguments(int argc, char* argv[]) {
  CommandLine cmdline;
  std::vector<std::string> arguments;
  for (int i = 1; i < argc; i++) {
    std::string arg{argv[i]};
    if ((arg == "-h") || (arg == "--help")) {
      CommandLine::show_help();
      return cmdline;
    }
    else if ((arg == "-u") || (arg == "--usage")) {
      CommandLine::show_usage();
      return cmdline;
    }
    else if ((arg == "-l") || (arg == "--list")) {
      CommandLine::show_formats();
      return cmdline;
    }
    else {
      arguments.push_back(std::move(arg));
    }
  }

  const std::string from_prefix{"--from-code="};
  const std::string to_prefix{"--to-code="};
  const std::string output_prefix{"--output="};

  // Rejects a repeated option, otherwise records that it has now been seen.
  const auto mark_seen = [](bool& seen, const char* duplicate_message) {
    if (seen) {
      throw std::invalid_argument(duplicate_message);
    }
    seen = true;
  };

  // Returns the argument that follows the short option at `index`. A short
  // option in last position gets an explicit error instead of the
  // implementation-defined std::out_of_range text of vector::at().
  const auto value_after = [&arguments](size_t index) -> const std::string& {
    if (index + 1 >= arguments.size()) {
      throw std::invalid_argument("Missing value after " + arguments[index] + ".");
    }
    return arguments[index + 1];
  };

  // True when `text` starts with `prefix` and has at least one character after it.
  const auto has_value_after_prefix = [](const std::string& text, const std::string& prefix) {
    return text.size() > prefix.size() && text.compare(0, prefix.size(), prefix) == 0;
  };

  bool has_from_arg = false;
  bool has_to_arg = false;
  bool has_output_arg = false;
  for (size_t i = 0; i < arguments.size();) {
    const std::string& arg = arguments[i];
    if (arg == "-f") {
      mark_seen(has_from_arg, "Only a single source encoding argument is allowed.");
      cmdline.from_encoding = value_after(i);
      i += 2;
    }
    else if (arg == "-t") {
      mark_seen(has_to_arg, "Only a single destination encoding argument is allowed.");
      cmdline.to_encoding = value_after(i);
      i += 2;
    }
    else if (arg == "-o") {
      mark_seen(has_output_arg, "Only a single output file is allowed.");
      cmdline.output_file = value_after(i);
      i += 2;
    }
    else if (has_value_after_prefix(arg, from_prefix)) {
      mark_seen(has_from_arg, "Only a single source encoding argument is allowed.");
      cmdline.from_encoding = arg.substr(from_prefix.size());
      i++;
    }
    else if (has_value_after_prefix(arg, to_prefix)) {
      mark_seen(has_to_arg, "Only a single destination encoding argument is allowed.");
      cmdline.to_encoding = arg.substr(to_prefix.size());
      i++;
    }
    else if (has_value_after_prefix(arg, output_prefix)) {
      mark_seen(has_output_arg, "Only a single output file is allowed.");
      cmdline.output_file = arg.substr(output_prefix.size());
      i++;
    }
    else {
      // Anything that is not a recognised option is treated as an input file.
      if (! std::filesystem::exists(arg)) {
        throw std::runtime_error("File " + arg + " does not exist.");
      }
      cmdline.input_files.push(arg);
      i++;
    }
  }
  if (!has_from_arg || !has_to_arg) {
    // The long options are spelled with a hyphen (--from-code / --to-code).
    throw std::invalid_argument("Missing -f ENCODING/--from-code=ENCODING or -t ENCODING/--to-code=ENCODING argument(s).");
  }
  return cmdline;
}
void CommandLine::run() {
if (output_file.empty()) {
run_procedure(stdout);
} else {
SIMDUTF_PUSH_DISABLE_WARNINGS
SIMDUTF_DISABLE_DEPRECATED_WARNING
std::FILE *fp = std::fopen(output_file.string().c_str(), "wb");
SIMDUTF_POP_DISABLE_WARNINGS
if (fp == NULL) {
printf("Could not open %s\n", output_file.string().c_str());
return;
}
run_procedure(fp);
if(fclose(fp) != 0) {
printf("Failed to close %s\n", output_file.string().c_str());
}
}
}
// Selects the converter for the (from_encoding, to_encoding) pair and streams
// all input files through it, writing the converted bytes to fpout. Pairs
// that are not handled by a simdutf routine below go to iconv_fallback().
//
// Every `proc` lambda follows the contract of run_simdutf_procedure(): it is
// given the number of valid bytes at the start of input_data, converts a
// prefix of them into output_buffer, and returns the number of input bytes it
// consumed. While more input is still pending (input_files is not empty) the
// prefix is shortened so that a UTF-8 sequence or a UTF-16 surrogate pair is
// not split between two chunks; the remaining bytes are carried over by the
// caller.
void CommandLine::run_procedure(std::FILE *fpout) {
  // Reports a chunk the converter rejected and drops the file at the front of
  // the queue. load_chunk() pops a file as soon as it has been read to the
  // end, so for the final chunk the queue is already empty: front()/pop()
  // must not be called then.
  const auto report_failure = [this]() {
    if (input_files.empty()) {
      printf("Could not convert input\n");
      return;
    }
    printf("Could not convert %s\n", input_files.front().string().c_str());
    input_files.pop();
  };

  // Writes `len` converted units of `unit_bytes` bytes each. A zero-length
  // result for a non-empty input is treated as a rejected chunk.
  const auto emit = [this, fpout, &report_failure](size_t len, size_t units_in, size_t unit_bytes) {
    if (len == 0 && units_in != 0) {
      report_failure();
    } else {
      write_to_file_descriptor(fpout, output_buffer.data(), len * unit_bytes);
    }
  };

  // UTF-8 input: while more data follows, stop before the last leading byte.
  const auto utf8_prefix = [this](size_t size) -> size_t {
    return input_files.empty() ? size : find_last_leading_byte(size);
  };

  // UTF-16 input: while more data follows, hold back a trailing high surrogate
  // (0xD800-0xDBFF). `high_byte_from_end` locates the most significant byte of
  // the last 16-bit unit: 1 for little endian, 2 for big endian.
  const auto utf16_prefix = [this](size_t size_bytes, size_t high_byte_from_end) -> size_t {
    if (!input_files.empty() && (input_data[size_bytes - high_byte_from_end] & 0xfc) == 0xd8) {
      return size_bytes - 2;
    }
    return size_bytes;
  };

  if (from_encoding == "UTF-8") {
    if (to_encoding == "UTF-16LE" || to_encoding == "UTF-16") {
      auto proc = [this, &emit, &utf8_prefix](size_t size) {
        size = utf8_prefix(size);
        const size_t len = simdutf::convert_utf8_to_utf16le(reinterpret_cast<const char*>(input_data.data()), size, reinterpret_cast<char16_t*>(output_buffer.data()));
        emit(len, size, sizeof(char16_t));
        return size;
      };
      run_simdutf_procedure(proc);
    } else if (to_encoding == "UTF-16BE") {
      auto proc = [this, &emit, &utf8_prefix](size_t size) {
        size = utf8_prefix(size);
        const size_t len = simdutf::convert_utf8_to_utf16be(reinterpret_cast<const char*>(input_data.data()), size, reinterpret_cast<char16_t*>(output_buffer.data()));
        emit(len, size, sizeof(char16_t));
        return size;
      };
      run_simdutf_procedure(proc);
    } else if (to_encoding == "UTF-32LE" || to_encoding == "UTF-32") {
      auto proc = [this, &emit, &utf8_prefix](size_t size) {
        size = utf8_prefix(size);
        const size_t len = simdutf::convert_utf8_to_utf32(reinterpret_cast<const char*>(input_data.data()), size, reinterpret_cast<char32_t*>(output_buffer.data()));
        emit(len, size, sizeof(char32_t));
        return size;
      };
      run_simdutf_procedure(proc);
    } else {
      iconv_fallback(fpout);
    }
  }
  else if (from_encoding == "UTF-16LE" || from_encoding == "UTF-16") {
    if (to_encoding == "UTF-8") {
      auto proc = [this, &emit, &utf16_prefix](size_t size_bytes) {
        size_bytes = utf16_prefix(size_bytes, 1);
        const size_t size = size_bytes / 2;
        const size_t len = simdutf::convert_utf16le_to_utf8(reinterpret_cast<const char16_t*>(input_data.data()), size, output_buffer.data());
        emit(len, size, sizeof(char));
        return size_bytes;
      };
      run_simdutf_procedure(proc);
    } else if (to_encoding == "UTF-16BE") {
      auto proc = [this, fpout](size_t size_bytes) {
        const size_t size = size_bytes / 2;
        simdutf::change_endianness_utf16(reinterpret_cast<const char16_t*>(input_data.data()), size, reinterpret_cast<char16_t*>(output_buffer.data()));
        // `size` counts 16-bit units, so the number of bytes to write is twice that.
        write_to_file_descriptor(fpout, output_buffer.data(), size * sizeof(char16_t));
        return size_bytes;
      };
      run_simdutf_procedure(proc);
    } else if (to_encoding == "UTF-32LE" || to_encoding == "UTF-32") {
      auto proc = [this, &emit, &utf16_prefix](size_t size_bytes) {
        size_bytes = utf16_prefix(size_bytes, 1);
        const size_t size = size_bytes / 2;
        const size_t len = simdutf::convert_utf16le_to_utf32(reinterpret_cast<const char16_t*>(input_data.data()), size, reinterpret_cast<char32_t*>(output_buffer.data()));
        emit(len, size, sizeof(char32_t));
        return size_bytes;
      };
      run_simdutf_procedure(proc);
    } else {
      iconv_fallback(fpout);
    }
  }
  else if (from_encoding == "UTF-16BE") {
    if (to_encoding == "UTF-8") {
      auto proc = [this, &emit, &utf16_prefix](size_t size_bytes) {
        // Same pending-input condition as every other UTF-16 branch.
        size_bytes = utf16_prefix(size_bytes, 2);
        const size_t size = size_bytes / 2;
        const size_t len = simdutf::convert_utf16be_to_utf8(reinterpret_cast<const char16_t*>(input_data.data()), size, output_buffer.data());
        emit(len, size, sizeof(char));
        return size_bytes;
      };
      run_simdutf_procedure(proc);
    } else if (to_encoding == "UTF-16LE" || to_encoding == "UTF-16") {
      auto proc = [this, fpout](size_t size_bytes) {
        const size_t size = size_bytes / 2;
        simdutf::change_endianness_utf16(reinterpret_cast<const char16_t*>(input_data.data()), size, reinterpret_cast<char16_t*>(output_buffer.data()));
        // `size` counts 16-bit units, so the number of bytes to write is twice that.
        write_to_file_descriptor(fpout, output_buffer.data(), size * sizeof(char16_t));
        return size_bytes;
      };
      run_simdutf_procedure(proc);
    } else if (to_encoding == "UTF-32LE" || to_encoding == "UTF-32") {
      auto proc = [this, &emit, &utf16_prefix](size_t size_bytes) {
        size_bytes = utf16_prefix(size_bytes, 2);
        const size_t size = size_bytes / 2;
        const size_t len = simdutf::convert_utf16be_to_utf32(reinterpret_cast<const char16_t*>(input_data.data()), size, reinterpret_cast<char32_t*>(output_buffer.data()));
        emit(len, size, sizeof(char32_t));
        return size_bytes;
      };
      run_simdutf_procedure(proc);
    } else {
      iconv_fallback(fpout);
    }
  }
  else if (from_encoding == "UTF-32LE" || from_encoding == "UTF-32") {
    if (to_encoding == "UTF-8") {
      auto proc = [this, &emit](size_t size_bytes) {
        const size_t size = size_bytes / 4;
        const size_t len = simdutf::convert_utf32_to_utf8(reinterpret_cast<const char32_t*>(input_data.data()), size, output_buffer.data());
        emit(len, size, sizeof(char));
        return size_bytes;
      };
      run_simdutf_procedure(proc);
    } else if (to_encoding == "UTF-16LE" || to_encoding == "UTF-16") {
      auto proc = [this, &emit](size_t size_bytes) {
        const size_t size = size_bytes / 4;
        const size_t len = simdutf::convert_utf32_to_utf16le(reinterpret_cast<const char32_t*>(input_data.data()), size, reinterpret_cast<char16_t*>(output_buffer.data()));
        emit(len, size, sizeof(char16_t));
        return size_bytes;
      };
      run_simdutf_procedure(proc);
    } else if (to_encoding == "UTF-16BE") {
      auto proc = [this, &emit](size_t size_bytes) {
        const size_t size = size_bytes / 4;
        const size_t len = simdutf::convert_utf32_to_utf16be(reinterpret_cast<const char32_t*>(input_data.data()), size, reinterpret_cast<char16_t*>(output_buffer.data()));
        emit(len, size, sizeof(char16_t));
        return size_bytes;
      };
      run_simdutf_procedure(proc);
    } else {
      iconv_fallback(fpout);
    }
  }
  else {
    iconv_fallback(fpout);
  }
}
// Drives a chunked conversion over all queued input files.
//
// PROCEDURE takes as parameter the number of bytes to consume in input_data (from the start of input_data).
// PROCEDURE consumes from the start of input_data buffer.
// PROCEDURE returns the number of bytes consumed.
//
// Bytes that PROCEDURE did not consume are moved to the front of input_data
// and become the beginning of the next chunk. A file that cannot be loaded is
// reported and removed from the queue.
template <typename PROCEDURE>
void CommandLine::run_simdutf_procedure(PROCEDURE proc) {
  size_t leftovers{0};
  while (!(input_files.empty())) {
    // load_chunk() appends after the bytes carried over from the previous round.
    size_t input_size = leftovers;
    if (!load_chunk(&input_size)) {
      printf("Could not load %s\n", input_files.front().string().c_str());
      input_files.pop();
      continue;
    }
    leftovers = input_size - proc(input_size);
    // Copy leftover bytes to the start of input_data. The index is a size_t
    // to match `leftovers`; copying front-to-back is safe because the
    // destination never lies after the source.
    const size_t first_leftover = input_size - leftovers;
    for (size_t i = 0; i < leftovers; i++) {
      input_data[i] = input_data[first_leftover + i];
    }
  }
}
// Converts all queued input files with iconv and writes the result to fpout.
// Used for encoding pairs that run_procedure() does not route to simdutf.
//
// When the program is built without iconv (ICONV_AVAILABLE is false) nothing
// is converted: an explanatory message is written to stderr and the list of
// supported formats is shown.
void CommandLine::iconv_fallback(std::FILE *fpout) {
#if ICONV_AVAILABLE
  iconv_t cv = iconv_open(to_encoding.c_str(), from_encoding.c_str());
  if (cv == (iconv_t)(-1)) {
    fprintf( stderr,"[iconv] cannot initialize %s to %s converter\n", from_encoding.c_str(), to_encoding.c_str());
    return;
  }
  size_t leftovers{0};
  while (!(input_files.empty())) {
    // load_chunk() appends after the bytes carried over from the previous round.
    size_t input_size{leftovers};
    if(!load_chunk(&input_size)) { printf("Could not load %s\n", input_files.front().string().c_str()); input_files.pop(); continue; }
    size_t inbytes = input_size;
    // Output space offered to iconv: four bytes per input byte.
    size_t outbytes = sizeof(uint32_t) * inbytes;
    size_t start_outbytes = outbytes;
    // win-iconv includes WINICONV_CONST in its function signatures
    // https://github.com/simdutf/simdutf/pull/178
#ifdef WINICONV_CONST
    WINICONV_CONST char * inptr = reinterpret_cast<WINICONV_CONST char *>(input_data.data());
#else
    char * inptr = reinterpret_cast<char *>(input_data.data());
#endif
    char * outptr = reinterpret_cast<char *>(output_buffer.data());
    size_t result = iconv(cv, &inptr, &inbytes, &outptr, &outbytes);
    if (result == static_cast<size_t>(-1)) {
      if (errno == EINVAL) { // Incomplete multibyte sequence error is ok
        // Copy leftover bytes to the start of input_data
        leftovers = inbytes;
        const size_t first_leftover = input_size - leftovers;
        for (size_t i = 0; i < leftovers; i++) {
          input_data[i] = input_data[first_leftover + i];
        }
      } else {
        fprintf( stderr,"[iconv] Error iconv.\n");
        // Release the conversion descriptor on this early exit as well.
        iconv_close(cv);
        return;
      }
    } else {
      leftovers = 0;
    }
    // iconv decreased outbytes by the number of bytes it produced.
    write_to_file_descriptor(fpout, output_buffer.data(), start_outbytes - outbytes);
  }
  iconv_close(cv);
#else
  fprintf( stderr, "Conversion from %s to %s is not supported by the simdutf library and iconv is not available.\n", from_encoding.c_str(), to_encoding.c_str());
  show_formats();
#endif
}
// Writes `length` bytes starting at `data` to `fp`.
// Returns true only when `fp` is not null and every byte was written.
bool CommandLine::write_to_file_descriptor(std::FILE *fp, const char * data, size_t length) {
  return fp != NULL && std::fwrite(data, 1, length, fp) == length;
}
// Loads a chunk of data (CHUNK_SIZE bytes). *input_size is the current input size and is updated to the new input size after.
//
// Data is appended to input_data after the first *input_size bytes, reading
// from the file at the front of input_files and moving on to the next file
// whenever one is exhausted, until the chunk is full or no file is left.
// A file that has been read to the end is closed and removed from the queue.
//
// Returns false when a file cannot be opened, read or closed. In that case
// the failing file is still at the front of input_files and no file handle is
// kept open, so the callers in this file can pop it and continue with the
// next one.
bool CommandLine::load_chunk(size_t *input_size) {
  size_t count = CHUNK_SIZE - *input_size;
  while (count > 0) {
    // Open a file if no file is opened
    if (current_file == nullptr) {
      SIMDUTF_PUSH_DISABLE_WARNINGS
      SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
      current_file = std::fopen(input_files.front().string().c_str(), "rb");
      SIMDUTF_POP_DISABLE_WARNINGS
      if (current_file == nullptr) { return false; }
    }
    // Try to read 'count' bytes
    const size_t bytes_read = std::fread(input_data.data() + *input_size, 1, count, current_file);
    if (std::ferror(current_file)) {
      // Drop the broken handle; otherwise the next call would keep reading
      // from it on behalf of whichever file is then at the front of the queue.
      std::fclose(current_file);
      current_file = nullptr;
      return false;
    }
    count -= bytes_read; // 'count' should never be negative since we read at most 'count' bytes
    *input_size += bytes_read; // input_size should never exceed count initial value
    if (std::feof(current_file)) { // Check if current_file is done
      // The stream is no longer usable after fclose, whether it succeeded or
      // not, so the pointer is cleared before the result is examined.
      const bool closed_cleanly = (std::fclose(current_file) == 0);
      current_file = nullptr;
      if (!closed_cleanly) { return false; }
      input_files.pop();
      if (input_files.empty()) { break; }
    }
  }
  return true;
}
// Given the size of the input from the start of input_data, returns the size just before the last leading byte,
// i.e. the index of the last byte within the final four that is not a UTF-8 continuation byte (10xxxxxx).
// The scan never goes below index 0, so sizes smaller than 4 (including 0) are handled; if only
// continuation bytes are seen, the size reduced by the number of bytes inspected is returned.
size_t CommandLine::find_last_leading_byte(size_t size) {
  // A leading byte cannot be further than 3 bytes away from the end for valid input
  for (int i = 0; i < 4 && size > 0; i++) {
    size--;
    if ((input_data[size] & 0b11000000) != 0b10000000) { break; }
  }
  return size;
}
// Prints the full help text to standard output. The opening sentence
// mentions iconv only when the program is built with ICONV_AVAILABLE.
void CommandLine::show_help() {
  static const char *const sections[] = {
#if ICONV_AVAILABLE
    "Convert encoding of input files from one encoding to another using simdutf library for supported formats and iconv for other formats.\n",
#else
    "Convert encoding of input files from one encoding to another using simdutf library for supported formats.\n",
#endif
    "Usage: sutf [OPTIONS...] [INPUTFILES...]\n\n",
    " Encoding specification(mandatory):\n"
    " -f, --from-code=ENCODING encoding of source text\n"
    " -t, --to-code=ENCODING encoding of output\n\n",
    " Output(optional):\n"
    " -o,--output=FILE output file\n\n",
    " Information(optional):\n"
    " -h,--help Display this help text\n"
    " -u,--usage Display short usage message\n"
    " -l,--list Display supported formats by simdutf library\n\n",
    "If output is not specified, the output is redirected to standard output.\n",
  };
  // None of the sections contains a conversion specifier, so each one is
  // written verbatim.
  for (const char *section : sections) {
    std::fputs(section, stdout);
  }
}
// Prints the two-line usage synopsis to standard output.
void CommandLine::show_usage() {
  static const char usage[] =
      "Usage: sutf [OPTION...] [-f ENCODING] [-t ENCODING] [-o OUTPUTFILE] [-l] [-h] [-u]\n"
      " [--from-code=ENCODING] [--to-code=ENCODING] [--output=OUTPUTFILE] [--list] [--help] [--usage] [INPUTFILES...]\n";
  std::fputs(usage, stdout);
}
// Prints the encodings handled by simdutf to standard output and, when the
// program is built with ICONV_AVAILABLE, a hint on how to list iconv's formats.
void CommandLine::show_formats() {
  std::fputs("Formats supported by simdutf library: UTF-8, UTF-16LE, UTF-16BE, UTF-32LE\n", stdout);
#if ICONV_AVAILABLE
  std::fputs("Try \"iconv -l\" or \"iconv --list\" to see formats supported by iconv.\n", stdout);
#endif
}
int main(int argc, char* argv[]) {
try {
CommandLine cmdline = parse_and_validate_arguments(argc, argv);
cmdline.run();
return EXIT_SUCCESS;
} catch (const std::exception& e) {
printf("%s\n", e.what());
CommandLine::show_help();
return EXIT_FAILURE;
}
}